diff --git a/addb2/dump.c b/addb2/dump.c index c9bd0717e2a..b306a74e55d 100644 --- a/addb2/dump.c +++ b/addb2/dump.c @@ -861,6 +861,17 @@ static void dtx0_state_counter(struct m0_addb2__context *ctx, char *buf) sm_trans(&m0_dtx_sm_conf, "dtx0", ctx, buf); } +extern struct m0_sm_conf m0_drm_sm_conf; +static void drm_state(struct m0_addb2__context *ctx, const uint64_t *v, + char *buf) +{ + sm_state(&m0_drm_sm_conf, ctx, v, buf); +} + +static void drm_state_counter(struct m0_addb2__context *ctx, char *buf) +{ + sm_trans(&m0_drm_sm_conf, "drm", ctx, buf); +} extern struct m0_sm_conf op_states_conf; static void beop_state_counter(struct m0_addb2__context *ctx, char *buf) @@ -1149,6 +1160,11 @@ struct m0_addb2__id_intrp ids[] = { .ii_repeat = M0_AVI_DTX0_SM_COUNTER_END - M0_AVI_DTX0_SM_COUNTER, .ii_spec = &dtx0_state_counter }, + { M0_AVI_DRM_SM_STATE, "drm-state", { &drm_state, SKIP2 } }, + { M0_AVI_DRM_SM_COUNTER, "", + .ii_repeat = M0_AVI_DRM_SM_COUNTER_END - M0_AVI_DRM_SM_COUNTER, + .ii_spec = &drm_state_counter }, + { M0_AVI_BE_TX_STATE, "tx-state", { &tx_state, SKIP2 } }, { M0_AVI_BE_TX_COUNTER, "", .ii_repeat = M0_AVI_BE_TX_COUNTER_END - M0_AVI_BE_TX_COUNTER, diff --git a/be/dtm0_log.c b/be/dtm0_log.c index 813222f1dfe..fae419ab20b 100644 --- a/be/dtm0_log.c +++ b/be/dtm0_log.c @@ -116,15 +116,12 @@ M0_INTERNAL void m0_be_dtm0_log_fini(struct m0_be_dtm0_log *log) log->dl_cs = NULL; } -M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log **in_log) +M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log *log) { - struct m0_be_dtm0_log *log = *in_log; - M0_PRE(!log->dl_is_persistent); m0_free(log->u.dl_inmem); m0_free(log); - *in_log = NULL; } /** @@ -394,8 +391,8 @@ static int dtm0_log__insert(struct m0_be_dtm0_log *log, struct m0_buf *payload) { int rc; - struct m0_dtm0_log_rec *rec; - struct m0_be_seg *seg = log->dl_seg; + struct m0_dtm0_log_rec *rec; + struct m0_be_seg *seg = log->dl_seg; if (log->dl_is_persistent) { rc = plog_rec_init(&rec, tx, seg, txd, payload); @@ -454,7 +451,7 @@ M0_INTERNAL int m0_be_dtm0_log_update(struct m0_be_dtm0_log *log, struct m0_dtm0_tx_desc *txd, struct m0_buf *payload) { - struct m0_dtm0_log_rec *rec; + struct m0_dtm0_log_rec *rec; M0_PRE(payload != NULL); M0_PRE(m0_be_dtm0_log__invariant(log)); @@ -653,8 +650,8 @@ static bool be_dtm0_log_iter_is_first(const struct m0_be_dtm0_log_iter *iter) static bool be_dtm0_log_iter_invariant(const struct m0_be_dtm0_log_iter *iter) { return _0C(m0_be_dtm0_log__invariant(iter->dli_log)) && - _0C(be_dtm0_log_iter_is_first(iter) || - m0_dtm0_tid__invariant(&iter->dli_current_tid)); + _0C(ergo(!be_dtm0_log_iter_is_first(iter), + m0_dtm0_tid__invariant(&iter->dli_current_tid))); } M0_INTERNAL void m0_be_dtm0_log_iter_init(struct m0_be_dtm0_log_iter *iter, @@ -671,10 +668,10 @@ M0_INTERNAL void m0_be_dtm0_log_iter_fini(struct m0_be_dtm0_log_iter *iter) } M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter, - struct m0_dtm0_log_rec *out) + struct m0_dtm0_log_rec *out) { struct m0_dtm0_log_rec *rec; - int rc; + int rc; M0_PRE(m0_mutex_is_locked(&iter->dli_log->dl_lock)); M0_PRE(be_dtm0_log_iter_invariant(iter)); @@ -699,7 +696,7 @@ M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter, return M0_ERR(rc); } - return M0_RC(rec == NULL ? 0 : +1); + return M0_RC(rec != NULL ? 
0 : -ENOENT); } M0_INTERNAL int m0_dtm0_log_rec_copy(struct m0_dtm0_log_rec *dst, diff --git a/be/dtm0_log.h b/be/dtm0_log.h index 4e5751bdf5f..70763aebd6f 100644 --- a/be/dtm0_log.h +++ b/be/dtm0_log.h @@ -179,7 +179,7 @@ struct m0_dtm0_log_rec { }; /** - * @b m0_be_dtm0_log_rec structure + * @b m0_be_dtm0_log structure * * A DTM0 log is represented by m0_be_dtm0_log structure. The important * fields in this structure are: @@ -277,13 +277,21 @@ M0_INTERNAL void m0_be_dtm0_log_fini(struct m0_be_dtm0_log *log); * Free the memory allocated by m0_be_dtm0_log_alloc * * @pre m0_be_dtm0_log->dl_is_persistent needs to be false. - * @post *log is set to NULL. + * @post None * * @param log Pointer to a log structure that has been previously allocated. * * @return None */ -M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log **log); +M0_INTERNAL void m0_be_dtm0_log_free(struct m0_be_dtm0_log *log); + +/** Frees memory and zeroes the pointer. */ +#define m0_be_dtm0_log_free0(pptr) \ + do { \ + typeof(pptr) __pptr = (pptr); \ + m0_be_dtm0_log_free(*__pptr); \ + *__pptr = NULL; \ + } while (0) /** * For performing an operation on a persistent log, we need to take the @@ -553,11 +561,11 @@ M0_INTERNAL void m0_be_dtm0_log_iter_fini(struct m0_be_dtm0_log_iter *iter); * @param out returned record which is copied and needs to be freed with * m0_dtm0_log_iter_rec_fini() * - * @return +1 when the iterator was successfully moved to the next record. - * @return 0 when the end of the log has been reached. + * @return 0 when the iterator was successfully moved to the next record. + * @return -ENOENT when the end of the log has been reached. */ M0_INTERNAL int m0_be_dtm0_log_iter_next(struct m0_be_dtm0_log_iter *iter, - struct m0_dtm0_log_rec *out); + struct m0_dtm0_log_rec *out); /** @} */ /* end DTM0Internals */ diff --git a/be/ut/dtm0_log_ut.c b/be/ut/dtm0_log_ut.c index 88f9ec77127..281a8654e12 100644 --- a/be/ut/dtm0_log_ut.c +++ b/be/ut/dtm0_log_ut.c @@ -281,7 +281,7 @@ void test_volatile_dtm0_log(void) m0_mutex_unlock(&log->dl_lock); m0_be_dtm0_log_fini(log); - m0_be_dtm0_log_free(&log); + m0_be_dtm0_log_free0(&log); m0_dtm0_clk_src_fini(&cs); } @@ -625,7 +625,7 @@ static void m0_be_ut_dtm0_log_init_fini(void) m0_be_dtm0_log_iter_fini(&iter); m0_be_dtm0_log_fini(log); - m0_be_dtm0_log_free(&log); + m0_be_dtm0_log_free0(&log); m0_dtm0_clk_src_fini(&cs); } @@ -635,10 +635,10 @@ static void m0_be_ut_dtm0_log_next(void) struct m0_buf buf = {}; struct m0_be_dtm0_log_iter iter; - struct m0_dtm0_log_rec out; + struct m0_dtm0_log_rec out; struct m0_dtm0_clk_src cs; struct m0_be_dtm0_log *log; - int rc; + int rc; m0_dtm0_clk_src_init(&cs, M0_DTM0_CS_PHYS); @@ -660,11 +660,12 @@ static void m0_be_ut_dtm0_log_next(void) m0_be_dtm0_log_iter_init(&iter, log); rc = m0_be_dtm0_log_iter_next(&iter, &out); - M0_UT_ASSERT(rc == 1); + M0_UT_ASSERT(rc == 0); + M0_UT_ASSERT(ut_dl_verify_log_rec(&out, 42)); m0_dtm0_log_iter_rec_fini(&out); rc = m0_be_dtm0_log_iter_next(&iter, &out); - M0_UT_ASSERT(rc == 0); + M0_UT_ASSERT(rc != 0); m0_be_dtm0_log_iter_fini(&iter); /* make log finalisation happy */ @@ -672,7 +673,7 @@ static void m0_be_ut_dtm0_log_next(void) M0_UT_ASSERT(rc == 0); m0_mutex_unlock(&log->dl_lock); m0_be_dtm0_log_fini(log); - m0_be_dtm0_log_free(&log); + m0_be_dtm0_log_free0(&log); m0_dtm0_clk_src_fini(&cs); } diff --git a/cas/cas.h b/cas/cas.h index fb09975e798..7e9e92fb4b9 100644 --- a/cas/cas.h +++ b/cas/cas.h @@ -471,6 +471,9 @@ M0_INTERNAL int m0_cas_fom_spawn( struct m0_fop *cas_fop, void 
(*on_fom_complete)(struct m0_fom_thralldom *, struct m0_fom *));
+M0_INTERNAL uint32_t m0_cas_svc_device_id_get(
+	const struct m0_reqh_service_type *stype,
+	const struct m0_reqh              *reqh);
 #else
 #define m0_cas_svc_init()
 #define m0_cas_svc_fini()
diff --git a/cas/service.c b/cas/service.c
index 515bff9675f..2458b824855 100644
--- a/cas/service.c
+++ b/cas/service.c
@@ -308,9 +308,21 @@ enum {
 	STATS_NR
 };
 
+enum {
+	INVALID_CAS_SDEV_ID = UINT32_MAX
+};
+
 struct cas_service {
 	struct m0_reqh_service  c_service;
 	struct m0_be_domain    *c_be_domain;
+
+	/**
+	 * The sdev used by this CAS service instance. There should be
+	 * exactly one in the standard configuration, but the current unit
+	 * tests attach more than one storage device to emulate several CAS
+	 * services. In that case the special invalid value is used.
+	 */
+	uint32_t                c_sdev_id;
 };
 
 struct cas_kv {
@@ -556,6 +568,60 @@ m0_cas__ut_svc_be_get(struct m0_reqh_service *svc)
 	return service->c_be_domain;
 }
 
+static int cas_service_sdev_id_set(struct cas_service *cas_svc)
+{
+	struct m0_reqh         *reqh;
+	struct m0_conf_cache   *cache;
+	struct m0_conf_obj     *obj;
+	struct m0_fid          *cas_svc_fid;
+	struct m0_conf_service *service;
+	struct m0_conf_obj     *sdevs_dir;
+	struct m0_conf_sdev    *sdev = NULL;
+	bool                    found = false;
+	int                     rc = 0;
+
+	M0_ASSERT(cas_svc->c_sdev_id == INVALID_CAS_SDEV_ID);
+
+	if (cas_in_ut())
+		return M0_RC(0);
+
+	reqh = cas_svc->c_service.rs_reqh;
+	cache = &m0_reqh2confc(reqh)->cc_cache;
+
+	cas_svc_fid = &cas_svc->c_service.rs_service_fid;
+	obj = m0_conf_cache_lookup(cache, cas_svc_fid);
+	M0_ASSERT(obj != NULL);
+
+	service = M0_CONF_CAST(obj, m0_conf_service);
+	M0_ASSERT(service != NULL);
+
+	sdevs_dir = &service->cs_sdevs->cd_obj;
+	rc = m0_confc_open_sync(&sdevs_dir, sdevs_dir, M0_FID0);
+	if (rc != 0)
+		return M0_RC(rc);
+
+	obj = NULL;
+	while ((rc = m0_confc_readdir_sync(sdevs_dir, &obj)) > 0) {
+		sdev = M0_CONF_CAST(obj, m0_conf_sdev);
+		if (!found) {
+			cas_svc->c_sdev_id = sdev->sd_dev_idx;
+			found = true;
+		} else {
+			/*
+			 * Several devices attached to a single CAS service
+			 * are possible if we are running in UT. In this case
+			 * we use the invalid value for the attached storage
+			 * device. Also, we cannot break out of the loop until
+			 * the whole directory has been iterated, since doing
+			 * so leads to a crash during request handler
+			 * finalisation.
+			 */
+			cas_svc->c_sdev_id = INVALID_CAS_SDEV_ID;
+		}
+	}
+
+	m0_confc_close(sdevs_dir);
+	return M0_RC(rc);
+}
 
 static int cas_service_start(struct m0_reqh_service *svc)
 {
@@ -568,6 +634,7 @@ static int cas_service_start(struct m0_reqh_service *svc)
 	/* XXX It's a workaround. It's needed until we have a better way. */
 	service->c_be_domain = ut_dom != NULL ?
 		ut_dom : svc->rs_reqh_ctx->rc_beseg->bs_domain;
+	service->c_sdev_id = INVALID_CAS_SDEV_ID;
 	rc = m0_ctg_store_init(service->c_be_domain);
 	if (rc == 0) {
 		/*
		 *
@@ -576,6 +643,7 @@ static int cas_service_start(struct m0_reqh_service *svc)
 		 * If no pending index drop, it finishes soon.
*/ m0_cas_gc_start(svc); + rc = cas_service_sdev_id_set(service); } return rc; } @@ -1832,6 +1900,7 @@ static int cas_device_check(const struct cas_fom *fom, pm = &pver->pv_mach; rc = cas_sdev_state(pm, device_id, &state); if (rc == 0 && !M0_IN(state, (M0_PNDS_ONLINE, + M0_PNDS_OFFLINE, M0_PNDS_SNS_REBALANCING))) rc = M0_ERR(-EBADFD); } else @@ -2610,6 +2679,26 @@ M0_INTERNAL int m0_cas_fom_spawn( return M0_RC(rc); } +M0_INTERNAL uint32_t m0_cas_svc_device_id_get( + const struct m0_reqh_service_type *stype, + const struct m0_reqh *reqh) +{ + struct m0_reqh_service *svc; + struct cas_service *cas_svc; + + M0_PRE(stype != NULL); + M0_PRE(reqh != NULL); + + svc = m0_reqh_service_find(stype, reqh); + M0_ASSERT(svc != NULL); + M0_ASSERT(m0_reqh_service_state_get(svc) == M0_RST_STARTED); + + cas_svc = M0_AMB(cas_svc, svc, c_service); + + M0_ASSERT(cas_svc->c_sdev_id != INVALID_CAS_SDEV_ID); + return cas_svc->c_sdev_id; +} + static int cas_ctg_crow_handle(struct cas_fom *fom, const struct m0_cas_id *cid) { struct m0_fop *fop; diff --git a/dix/fid_convert.c b/dix/fid_convert.c index 5a221d89de0..c326158c554 100644 --- a/dix/fid_convert.c +++ b/dix/fid_convert.c @@ -35,6 +35,15 @@ * @{ */ +/* Set device id in a DIX fid. */ +M0_INTERNAL void m0_dix_fid__device_id_set(struct m0_fid *fid, + uint32_t dev_id) +{ + M0_PRE(fid != NULL && (dev_id <= M0_DIX_FID_DEVICE_ID_MAX)); + fid->f_container = (fid->f_container & ~M0_DIX_FID_DEVICE_ID_MASK) | + (((uint64_t)dev_id) << M0_DIX_FID_DEVICE_ID_OFFSET); +} + /* extract bits [32, 56) from fid->f_container */ M0_INTERNAL uint32_t m0_dix_fid__device_id_extract(const struct m0_fid *fid) { diff --git a/dix/fid_convert.h b/dix/fid_convert.h index 548636e0e6a..3508201e450 100644 --- a/dix/fid_convert.h +++ b/dix/fid_convert.h @@ -78,6 +78,8 @@ M0_INTERNAL bool m0_dix_fid_validate_cctg(const struct m0_fid *cctg_fid); M0_INTERNAL uint32_t m0_dix_fid__device_id_extract(const struct m0_fid *fid); +M0_INTERNAL void m0_dix_fid__device_id_set(struct m0_fid *fid, + uint32_t dev_id); /** @} end of dix group */ #endif /* __MOTR_DIX_FID_CONVERT_H__ */ diff --git a/dix/req.c b/dix/req.c index 8f797311ab9..f5cb4b113fd 100644 --- a/dix/req.c +++ b/dix/req.c @@ -1666,33 +1666,6 @@ static void dix_rop_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast) } } -static void dix_rop_one_completed(struct m0_dix_cas_rop *crop) -{ - struct m0_dix_req *dreq = crop->crp_parent; - struct m0_dix_rop_ctx *rop; - - M0_ENTRY(); - M0_PRE(!dreq->dr_is_meta); - M0_PRE(M0_IN(dreq->dr_type, (DIX_PUT, DIX_DEL))); - M0_PRE(dreq->dr_dtx != NULL); - M0_PRE(dix_req_smgrp(dreq) == dreq->dr_dtx->tx_dtx->dd_sm.sm_grp); - - rop = crop->crp_parent->dr_rop; - dix_cas_rop_rc_update(crop, 0); - - m0_dtx0_executed(dreq->dr_dtx, crop->crp_pa_idx); - - if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) { - rop->dg_ast = (struct m0_sm_ast) { - .sa_cb = dix_rop_completed, - .sa_datum = dreq, - }; - m0_sm_ast_post(dix_req_smgrp(dreq), &rop->dg_ast); - } - - M0_LEAVE(); -} - static bool dix_cas_rop_clink_cb(struct m0_clink *cl) { struct m0_dix_cas_rop *crop = container_of(cl, struct m0_dix_cas_rop, @@ -1714,28 +1687,20 @@ static bool dix_cas_rop_clink_cb(struct m0_clink *cl) dreq, crop->crp_creq.ccr_sess, &crop->crp_creq.ccr_remid); - m0_clink_del(cl); m0_clink_fini(cl); rop = crop->crp_parent->dr_rop; rop->dg_completed_nr++; M0_PRE(rop->dg_completed_nr <= rop->dg_cas_reqs_nr); - if (dreq->dr_dtx != NULL) { - M0_ASSERT(dix_req_smgrp(dreq) == - dreq->dr_dtx->tx_dtx->dd_sm.sm_grp); - dix_rop_one_completed(crop); - } 
else { - if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) { - rop->dg_ast = (struct m0_sm_ast) { - .sa_cb = dix_rop_completed, - .sa_datum = dreq, - }; - m0_sm_ast_post(dix_req_smgrp(dreq), - &rop->dg_ast); - } + if (rop->dg_completed_nr == rop->dg_cas_reqs_nr) { + rop->dg_ast = (struct m0_sm_ast) { + .sa_cb = dix_rop_completed, + .sa_datum = dreq, + }; + m0_sm_ast_post(dix_req_smgrp(dreq), + &rop->dg_ast); } - } return true; } @@ -1747,7 +1712,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req) struct m0_dix_cas_rop *cas_rop; struct m0_cas_req *creq; uint32_t sdev_idx; - uint32_t pa_idx; struct m0_cas_id cctg_id; struct m0_reqh_service_ctx *cas_svc; struct m0_dix_layout *layout = &req->dr_indices[0].dd_layout; @@ -1812,17 +1776,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req) } if (rc != 0) { - /* - * Treat failed and not sent CAS requests as executed - * to unblock the EXECUTED-ALL logic. It allows to move - * transaction to the stable state once the persistent - * message received (EXECUTED state required for all - * participants). So EXECUTED participant state is - * reused in case of failure. - */ - if (req->dr_dtx != NULL) - m0_dtx0_executed(req->dr_dtx, - cas_rop->crp_pa_idx); m0_clink_del(&cas_rop->crp_clink); m0_clink_fini(&cas_rop->crp_clink); m0_cas_req_fini(&cas_rop->crp_creq); @@ -1849,16 +1802,6 @@ static int dix_cas_rops_send(struct m0_dix_req *req) rc = m0_dtx0_close(req->dr_dtx); if (rc != 0) return M0_ERR(rc); - /* - * It is safe to set EXECUTED dtx state for those - * participants that experience transient failure, - * it allows to trigger EXECUTED-ALL logic. See - * the similar comment above for details. - */ - for (pa_idx = cas_rop_tlist_length(&rop->dg_cas_reqs); - pa_idx < req->dr_dtx->tx_dtx->dd_txd.dtd_ps.dtp_nr; - pa_idx++) - m0_dtx0_executed(req->dr_dtx, pa_idx); } return M0_RC(0); diff --git a/doc/dld/dld-index.c b/doc/dld/dld-index.c index a1a53a2d44d..a5f11c764e8 100644 --- a/doc/dld/dld-index.c +++ b/doc/dld/dld-index.c @@ -53,6 +53,7 @@ file.c --> - @subpage spiel-dld "SPIEL API DLD" - @subpage cas-dld "The catalogue service (CAS)" - @subpage fis-dld "Fault Injection at run time" +- @subpage dtm0br-dld "DTM0 basic recovery SM" Detailed designs should use the @subpage DLD "Motr DLD Template" as a style guide. 
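With dix_rop_one_completed() removed, the dtx and non-dtx paths in dix/req.c
share one completion scheme: dix_cas_rop_clink_cb() counts finished CAS
requests and posts a single dix_rop_completed() AST when the last one arrives,
while stabilisation of the dtx itself is driven purely by PERSISTENT messages
(see the dtm0/dtx.c changes below). A minimal sketch of that scheme follows;
the sub_op/parent types and parent_completed() callback are hypothetical
stand-ins for m0_dix_cas_rop/m0_dix_rop_ctx, not Motr API:

/*
 * Sketch: post exactly one completion AST when the last sub-operation
 * finishes. Only the m0_clink/m0_sm primitives are real; everything
 * else is hypothetical.
 */
#include "lib/chan.h" /* m0_clink */
#include "lib/misc.h" /* M0_AMB */
#include "sm/sm.h"    /* m0_sm_ast, m0_sm_group */

struct parent;

struct sub_op {
	struct m0_clink  so_clink;  /* signalled when this sub-op completes */
	struct parent   *so_parent;
};

struct parent {
	uint32_t            p_completed_nr; /* finished sub-ops            */
	uint32_t            p_total_nr;     /* launched sub-ops            */
	struct m0_sm_ast    p_ast;          /* posted once, by the last op */
	struct m0_sm_group *p_grp;
};

static void parent_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast);

static bool sub_op_done_cb(struct m0_clink *cl)
{
	struct sub_op *sop = M0_AMB(sop, cl, so_clink);
	struct parent *p   = sop->so_parent;

	m0_clink_del(cl);
	m0_clink_fini(cl);
	/* The last completed sub-op schedules the parent's completion. */
	if (++p->p_completed_nr == p->p_total_nr) {
		p->p_ast = (struct m0_sm_ast) {
			.sa_cb    = parent_completed,
			.sa_datum = p,
		};
		m0_sm_ast_post(p->p_grp, &p->p_ast);
	}
	return true;
}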
diff --git a/dtm0/Makefile.sub b/dtm0/Makefile.sub
index 965dbbd2dee..3d9a3ffee64 100644
--- a/dtm0/Makefile.sub
+++ b/dtm0/Makefile.sub
@@ -15,6 +15,7 @@ nobase_motr_include_HEADERS += \
 	dtm0/remach.h \
 	dtm0/service.h \
 	dtm0/svc_internal.h \
+	dtm0/recovery.h \
 	dtm0/tx_desc.h
 
@@ -33,6 +34,7 @@ motr_libmotr_la_SOURCES += \
 	dtm0/pruner.c \
 	dtm0/remach.c \
 	dtm0/service.c \
+	dtm0/recovery.c \
 	dtm0/tx_desc.c
 
diff --git a/dtm0/addb2.h b/dtm0/addb2.h
index 79f1918295d..7203c0ff812 100644
--- a/dtm0/addb2.h
+++ b/dtm0/addb2.h
@@ -34,9 +34,15 @@
 #include "addb2/identifier.h"
 
 enum m0_avi_dtm0_labels {
-	M0_AVI_DTX0_SM_STATE = M0_AVI_DTM0_RANGE_START,
+	/* dtx0 */
+	M0_AVI_DTX0_SM_STATE = M0_AVI_DTM0_RANGE_START + 1,
 	M0_AVI_DTX0_SM_COUNTER,
 	M0_AVI_DTX0_SM_COUNTER_END = M0_AVI_DTX0_SM_COUNTER + 0x100,
+
+	/* Recovery Machine */
+	M0_AVI_DRM_SM_STATE = M0_AVI_DTX0_SM_COUNTER_END + 1,
+	M0_AVI_DRM_SM_COUNTER,
+	M0_AVI_DRM_SM_COUNTER_END = M0_AVI_DRM_SM_COUNTER + 0x100,
 };
 
 /** @} end of dtm0 group */
diff --git a/dtm0/drlink.c b/dtm0/drlink.c
index 9e5dc46d443..e29eeb75d1b 100644
--- a/dtm0/drlink.c
+++ b/dtm0/drlink.c
@@ -23,6 +23,7 @@
 #include "lib/trace.h"
 #include "addb2/identifier.h"  /* M0_AVI_FOM_TO_TX */
 #include "dtm0/fop.h"          /* dtm0_req_fop */
+#include "dtm0/fop_xc.h"       /* dtm0_req_fop_xc */
 #include "dtm0/service.h"      /* m0_dtm0_service */
 #include "dtm0/svc_internal.h" /* dtm0_process */
 #include "lib/coroutine.h"     /* m0_co API */
@@ -31,6 +32,18 @@
 #include "rpc/rpc.h"           /* m0_rpc_item_post */
 #include "rpc/rpc_machine.h"   /* m0_rpc_machine */
 #include "rpc/rpc_opcodes.h"   /* M0_DTM0_{RLINK,REQ}_OPCODE */
+#include "lib/string.h"        /* m0_streq */
+
+enum {
+	/*
+	 * TODO: The DTM model assumes infinite timeouts, but this is too
+	 * scary at the moment: we cannot yet rely on an infinite timeout
+	 * working without issues in the connect/disconnect case. These
+	 * timeouts will be adjusted/reworked when we work more on
+	 * stabilising the DTM.
+	 */
+	DRLINK_CONNECT_TIMEOUT_SEC = 1,
+	DRLINK_DISCONN_TIMEOUT_SEC = DRLINK_CONNECT_TIMEOUT_SEC,
+};
 
 struct drlink_fom {
 	struct m0_fom df_gen;
@@ -71,7 +84,9 @@ static const struct m0_fom_ops drlink_fom_ops = {
 };
 
 static struct m0_fom_type drlink_fom_type;
-static const struct m0_fom_type_ops drlink_fom_type_ops = {};
+static const struct m0_fom_type_ops drlink_fom_type_ops = {
+	.fto_create = NULL
+};
 const static struct m0_sm_conf drlink_fom_conf;
 
 M0_INTERNAL int m0_dtm0_rpc_link_mod_init(void)
@@ -88,33 +103,26 @@ M0_INTERNAL void m0_dtm0_rpc_link_mod_fini(void)
 {
 }
 
-/* Create a deep copy of the given request. */
+/** Create a deep copy of the given request. */
 static struct dtm0_req_fop *dtm0_req_fop_dup(const struct dtm0_req_fop *src)
 {
-	int                  rc;
-	struct dtm0_req_fop *dst;
-
-	M0_ALLOC_PTR(dst);
-	if (dst == NULL)
-		return NULL;
-
-	rc = m0_dtm0_tx_desc_copy(&src->dtr_txr, &dst->dtr_txr);
-	if (rc != 0) {
-		M0_ASSERT(rc == -ENOMEM);
-		m0_free(dst);
-		return NULL;
-	}
-
-	rc = m0_buf_copy(&dst->dtr_payload, &src->dtr_payload);
-	if (rc != 0) {
-		m0_dtm0_tx_desc_fini(&dst->dtr_txr);
-		m0_free(dst);
-		return NULL;
-	}
-
-	dst->dtr_msg = src->dtr_msg;
-
-	return dst;
+	int                  rc = 0;
+	struct m0_xcode_obj  src_obj;
+	struct m0_xcode_obj  dest_obj;
+	struct m0_xcode_ctx  sctx;
+	struct m0_xcode_ctx  dctx;
+	struct dtm0_req_fop *dest = NULL;
+
+	/* There is no const version of xcode objects, we'll have to cast it.
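+	 * (Casting away const should be safe here: m0_xcode_dup() only
+	 * reads from the source xcode context.)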
*/ + src_obj = M0_XCODE_OBJ(dtm0_req_fop_xc, (struct dtm0_req_fop *)src); + dest_obj = M0_XCODE_OBJ(dtm0_req_fop_xc, NULL); + m0_xcode_ctx_init(&sctx, &src_obj); + m0_xcode_ctx_init(&dctx, &dest_obj); + dctx.xcx_alloc = m0_xcode_alloc; + rc = m0_xcode_dup(&dctx, &sctx); + if (rc == 0) + dest = dctx.xcx_it.xcu_stack[0].s_obj.xo_ptr; + return dest; } static void dtm0_req_fop_fini(struct dtm0_req_fop *req) @@ -189,7 +197,8 @@ static int drlink_fom_init(struct drlink_fom *fom, static void drlink_fom_fini(struct m0_fom *fom) { struct drlink_fom *df = fom2drlink_fom(fom); - m0_fop_put_lock(df->df_rfop); + if (df->df_rfop != NULL) + m0_fop_put_lock(df->df_rfop); m0_co_op_fini(&df->df_co_op); m0_fom_fini(fom); m0_free(fom); @@ -207,23 +216,6 @@ static void co_long_write_lock(struct m0_co_context *context, M0_CO_YIELD_RC(context, outcome); } -static void co_rpc_link_connect(struct m0_co_context *context, - struct m0_rpc_link *rlink, - struct m0_fom *fom, - int next_phase) -{ - M0_CO_REENTER(context); - - m0_chan_lock(&rlink->rlk_wait); - m0_fom_wait_on(fom, &rlink->rlk_wait, &fom->fo_cb); - m0_chan_unlock(&rlink->rlk_wait); - - m0_rpc_link_connect_async(rlink, M0_TIME_NEVER, NULL); - m0_fom_phase_set(fom, next_phase); - - M0_CO_YIELD_RC(context, M0_FSO_WAIT); -} - static int find_or_add(struct m0_dtm0_service *dtms, const struct m0_fid *tgt, struct dtm0_process **out) @@ -254,6 +246,7 @@ enum drlink_fom_state { DRF_INIT = M0_FOM_PHASE_INIT, DRF_DONE = M0_FOM_PHASE_FINISH, DRF_LOCKING = M0_FOM_PHASE_NR, + DRF_DISCONNECTING, DRF_CONNECTING, DRF_SENDING, DRF_WAITING_FOR_REPLY, @@ -286,10 +279,15 @@ static struct m0_sm_state_descr drlink_fom_states[] = { .sd_name = #name, \ .sd_allowed = allowed \ } - _ST(DRF_LOCKING, M0_BITS(DRF_CONNECTING, + _ST(DRF_LOCKING, M0_BITS(DRF_DISCONNECTING, + DRF_CONNECTING, DRF_SENDING, DRF_FAILED)), + _ST(DRF_DISCONNECTING, M0_BITS(DRF_CONNECTING, + DRF_FAILED)), _ST(DRF_CONNECTING, M0_BITS(DRF_SENDING, + DRF_CONNECTING, + DRF_DISCONNECTING, DRF_FAILED)), _ST(DRF_SENDING, M0_BITS(DRF_DONE, DRF_WAITING_FOR_REPLY, @@ -345,6 +343,7 @@ static int dtm0_process_rlink_reinit(struct dtm0_process *proc, const int max_in_flight = DTM0_MAX_RPCS_IN_FLIGHT; if (!M0_IS0(&proc->dop_rlink)) { + m0_rpc_conn_sessions_cancel(&proc->dop_rlink.rlk_conn); m0_rpc_link_fini(&proc->dop_rlink); M0_SET0(&proc->dop_rlink); } @@ -360,6 +359,9 @@ static int dtm0_process_rlink_send(struct dtm0_process *proc, struct m0_rpc_session *session = &proc->dop_rlink.rlk_sess; struct m0_rpc_item *item = &fop->f_item; + M0_ENTRY("remote: proc=" FID_F ", ep=%s", + FID_P(&proc->dop_rproc_fid), proc->dop_rep); + item->ri_ops = &dtm0_req_fop_rlink_rpc_item_ops; item->ri_session = session; item->ri_prio = M0_RPC_ITEM_PRIO_MID; @@ -368,43 +370,7 @@ static int dtm0_process_rlink_send(struct dtm0_process *proc, if (drf->df_wait_for_ack) m0_co_op_active(&drf->df_co_op); - return m0_rpc_post(item); -} - -/** An aggregated status of a dtm0_process:dop_rlink */ -enum dpr_state { - /** Link is not alive but we can resurrect it. */ - DPR_TRANSIENT, - /** Link is alive and ready to transfer items. */ - DPR_ONLINE, - /** Link is permanently dead. */ - DPR_FAILED, -}; - -static enum dpr_state dpr_state_infer(struct dtm0_process *proc) -{ - /* - * TODO: - * Observe the states of the following enitities: - * RPC connection - * RPC session - * Conf obj - * and then decide whether it is alive, dead or permanently dead. 
- *
- * @verbatim
- *     if (conf_obj is ONLINE) {
- *         if (conn is ACTIVE && session is in (IDLE, BUSY))
- *             return ONLINE;
- *         else
- *             return TRANSIENT;
- *     } else
- *         return FAILED;
- * @endverbatim
- */
-	if (m0_rpc_link_is_connected(&proc->dop_rlink))
-		return DPR_ONLINE;
-
-	return DPR_TRANSIENT;
+	return M0_RC(m0_rpc_post(item));
 }
 
 /*
@@ -430,12 +396,96 @@ static void drlink_addb_drf2parent_relate(struct drlink_fom *drf)
 			 m0_sm_id_get(&drf->df_gen.fo_sm_phase));
 }
 
+
 #define F M0_CO_FRAME_DATA
-static void drlink_coro_fom_tick(struct m0_co_context *context)
+
+/**
+ * Connect-disconnect context for an rpc link.
+ * The context ties together the clink-based notification from the rpc link
+ * channel and a co_op.
+ */
+struct rlink_cd_ctx {
+	struct m0_clink dc_clink;
+	struct m0_co_op dc_op;
+};
+
+static bool rlink_cd_cb(struct m0_clink *clink)
+{
+	struct rlink_cd_ctx *dc = M0_AMB(dc, clink, dc_clink);
+	m0_co_op_done(&dc->dc_op);
+	return false;
+}
+
+static void co_rlink_do(struct m0_co_context *context,
+			struct dtm0_process  *proc,
+			enum drlink_fom_state what)
 {
-	int                rc = 0;
 	struct drlink_fom *drf = M0_AMB(drf, context, df_co);
 	struct m0_fom     *fom = &drf->df_gen;
+	int                timeout_sec = 0;
+	void             (*action)(struct m0_rpc_link *, m0_time_t,
+				   struct m0_clink *) = NULL;
+
+	M0_CO_REENTER(context, struct rlink_cd_ctx dc;);
+
+	M0_SET0(&F(dc));
+	m0_clink_init(&F(dc).dc_clink, rlink_cd_cb);
+	F(dc).dc_clink.cl_is_oneshot = true;
+	m0_co_op_init(&F(dc).dc_op);
+	m0_co_op_active(&F(dc).dc_op);
+
+	switch (what) {
+	case DRF_CONNECTING:
+		timeout_sec = DRLINK_CONNECT_TIMEOUT_SEC;
+		action = m0_rpc_link_connect_async;
+		break;
+	case DRF_DISCONNECTING:
+		m0_rpc_conn_sessions_cancel(&proc->dop_rlink.rlk_conn);
+		timeout_sec = DRLINK_DISCONN_TIMEOUT_SEC;
+		action = m0_rpc_link_disconnect_async;
+		break;
+	default:
+		M0_IMPOSSIBLE("%d is something that cannot be done", what);
+		break;
+	}
+
+	action(&proc->dop_rlink, m0_time_from_now(timeout_sec, 0),
+	       &F(dc).dc_clink);
+
+	M0_CO_YIELD_RC(context, m0_co_op_tick_ret(&F(dc).dc_op, fom, what));
+	m0_chan_wait(&F(dc).dc_clink);
+
+	m0_co_op_fini(&F(dc).dc_op);
+	m0_clink_fini(&F(dc).dc_clink);
+}
+
+static bool has_volatile_param(struct m0_conf_obj *obj)
+{
+	struct m0_conf_service  *svc;
+	const char             **param;
+
+	svc = M0_CONF_CAST(obj, m0_conf_service);
+	M0_ASSERT(svc->cs_params != NULL);
+
+	for (param = svc->cs_params; *param != NULL; ++param) {
+		if (m0_streq(*param, "origin:in-volatile"))
+			return true;
+		else if (m0_streq(*param, "origin:in-persistent"))
+			return false;
+	}
+
+	M0_IMPOSSIBLE("Service origin is not defined in the config?");
+}
+
+static void drlink_coro_fom_tick(struct m0_co_context *context)
+{
+	int                 rc = 0;
+	struct drlink_fom  *drf = M0_AMB(drf, context, df_co);
+	struct m0_fom      *fom = &drf->df_gen;
+	const char         *reason = "Unknown";
+	struct m0_conf_obj *obj = NULL;
+	struct m0_confc    *confc = m0_reqh2confc(m0_fom2reqh(fom));
+	bool                always_reconnect = false;
 
 	M0_CO_REENTER(context,
 		      struct m0_long_lock_link llink;
@@ -455,8 +505,10 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
 	 */
 	m0_mutex_unlock(&drf->df_svc->dos_generic.rs_mutex);
 
-	if (rc != 0)
+	if (rc != 0) {
+		reason = "Cannot find-or-add remote process";
 		goto out;
+	}
 
 	m0_long_lock_link_init(&F(llink), fom, &F(llock_addb2));
 
@@ -466,27 +518,73 @@ static void drlink_coro_fom_tick(struct m0_co_context *context)
 					 DRF_LOCKING));
 	M0_ASSERT(m0_long_is_write_locked(&F(proc)->dop_llock, fom));
 
-	if (dpr_state_infer(F(proc)) == DPR_TRANSIENT) {
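+
+	/*
+	 * A service with volatile origin (e.g. a client) loses its state on
+	 * restart, so HA may still report it as non-ONLINE. Force the conf
+	 * object ONLINE and re-establish the RPC link from scratch.
+	 */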
+	m0_conf_cache_lock(&confc->cc_cache);
+	obj = m0_conf_cache_lookup(&confc->cc_cache, &F(proc)->dop_rserv_fid);
+	M0_ASSERT(obj != NULL);
+	if (has_volatile_param(obj) && obj->co_ha_state != M0_NC_ONLINE) {
+		M0_LOG(M0_DEBUG, "Force state transition %s -> %s for " FID_F,
+		       m0_ha_state2str(obj->co_ha_state),
+		       m0_ha_state2str(M0_NC_ONLINE),
+		       FID_P(&F(proc)->dop_rserv_fid));
+		obj->co_ha_state = M0_NC_ONLINE;
+		m0_chan_broadcast(&obj->co_ha_chan);
+		always_reconnect = true;
+	}
+	m0_conf_cache_unlock(&confc->cc_cache);
+
+	/* Reconnect if the session was cancelled. */
+	if (m0_rpc_link_is_connected(&F(proc)->dop_rlink) &&
+	    F(proc)->dop_rlink.rlk_sess.s_cancelled) {
+		always_reconnect = true;
+	}
+
+	/* XXX:
+	 * At this moment we cannot detect client failures.
+	 * Because of that, we cannot detect the case where
+	 * the client drops RPC items because it cannot
+	 * find the corresponding connection.
+	 * As a workaround, we force drlink to re-connect
+	 * whenever it tries to send a message.
+	 */
+
+	if (always_reconnect) {
+		if (m0_rpc_link_is_connected(&F(proc)->dop_rlink)) {
+			M0_CO_FUN(context, co_rlink_do(context, F(proc),
+						       DRF_DISCONNECTING));
+		}
+
 		rc = dtm0_process_rlink_reinit(F(proc), drf);
-		if (rc != 0)
+		if (rc != 0) {
+			reason = "Cannot reinit RPC link.";
 			goto unlock;
+		}
 		/*
 		 * TODO handle network failure after link is connected, but
 		 * before the message is successfully sent
 		 */
-		M0_CO_FUN(context, co_rpc_link_connect(context,
-						       &F(proc)->dop_rlink,
-						       fom, DRF_CONNECTING));
+		M0_CO_FUN(context, co_rlink_do(context, F(proc),
+					       DRF_CONNECTING));
+	} else {
+		if (!m0_rpc_link_is_connected(&F(proc)->dop_rlink)) {
+			rc = dtm0_process_rlink_reinit(F(proc), drf);
+			if (rc != 0) {
+				reason = "Cannot reinit RPC link.";
+				goto unlock;
+			}
+			M0_CO_FUN(context, co_rlink_do(context, F(proc),
+						       DRF_CONNECTING));
+		}
 	}
 
-	if (dpr_state_infer(F(proc)) == DPR_FAILED)
-		goto unlock;
+	M0_ASSERT(ergo(m0_rpc_link_is_connected(&F(proc)->dop_rlink),
+		       !F(proc)->dop_rlink.rlk_sess.s_cancelled));
 
-	M0_ASSERT(dpr_state_infer(F(proc)) == DPR_ONLINE);
 	m0_fom_phase_set(fom, DRF_SENDING);
 	rc = dtm0_process_rlink_send(F(proc), drf);
-	if (rc != 0)
+	if (rc != 0) {
+		reason = "Failed to post a message to RPC layer.";
 		goto unlock;
+	}
 
 	/* Safety: FOP (and item) can be released only in ::drlink_fom_fini.
*/ drlink_addb_drf2item_relate(drf); @@ -497,15 +595,27 @@ static void drlink_coro_fom_tick(struct m0_co_context *context) DRF_WAITING_FOR_REPLY)); m0_co_op_reset(&drf->df_co_op); rc = m0_rpc_item_error(&drf->df_rfop->f_item); + if (rc != 0) { + reason = "Rpc item error"; + goto unlock; + } } unlock: + if (drf->df_rfop != NULL) { + m0_fop_put_lock(drf->df_rfop); + drf->df_rfop = NULL; + } + m0_long_write_unlock(&F(proc)->dop_llock, &F(llink)); m0_long_lock_link_fini(&F(llink)); out: /* TODO handle the error */ - if (rc != 0) + if (rc != 0) { + M0_LOG(M0_ERROR, "Failed to send a message to" FID_F ", rc=%d, " + "reason=%s", FID_P(&drf->df_tgt), rc, reason); m0_fom_phase_move(fom, rc, DRF_FAILED); + } if (drf->df_op != NULL) m0_be_op_done(drf->df_op); diff --git a/dtm0/dtx.c b/dtm0/dtx.c index b9825b769cd..f81009e0bde 100644 --- a/dtm0/dtx.c +++ b/dtm0/dtx.c @@ -46,15 +46,7 @@ static struct m0_sm_state_descr dtx_states[] = { }, [M0_DDS_INPROGRESS] = { .sd_name = "inprogress", - .sd_allowed = M0_BITS(M0_DDS_EXECUTED, M0_DDS_FAILED), - }, - [M0_DDS_EXECUTED] = { - .sd_name = "executed", - .sd_allowed = M0_BITS(M0_DDS_EXECUTED_ALL), - }, - [M0_DDS_EXECUTED_ALL] = { - .sd_name = "executed-all", - .sd_allowed = M0_BITS(M0_DDS_STABLE), + .sd_allowed = M0_BITS(M0_DDS_STABLE, M0_DDS_FAILED), }, [M0_DDS_STABLE] = { .sd_name = "stable", @@ -72,11 +64,9 @@ static struct m0_sm_state_descr dtx_states[] = { static struct m0_sm_trans_descr dtx_trans[] = { { "populated", M0_DDS_INIT, M0_DDS_INPROGRESS }, - { "executed", M0_DDS_INPROGRESS, M0_DDS_EXECUTED }, - { "exec-all", M0_DDS_EXECUTED, M0_DDS_EXECUTED_ALL }, - { "exec-fail", M0_DDS_INPROGRESS, M0_DDS_FAILED }, - { "stable", M0_DDS_EXECUTED_ALL, M0_DDS_STABLE }, - { "prune", M0_DDS_STABLE, M0_DDS_DONE } + { "stabilised", M0_DDS_INPROGRESS, M0_DDS_STABLE }, + { "stab-fail", M0_DDS_INPROGRESS, M0_DDS_FAILED }, + { "prune-it", M0_DDS_STABLE, M0_DDS_DONE } }; struct m0_sm_conf m0_dtx_sm_conf = { @@ -109,7 +99,6 @@ static int dtx_log_insert(struct m0_dtm0_dtx *dtx) struct m0_dtm0_log_rec *record = M0_AMB(record, dtx, dlr_dtx); int rc; - M0_PRE(m0_dtm0_tx_desc_state_eq(&dtx->dd_txd, M0_DTPS_INPROGRESS)); M0_PRE(dtx->dd_dtms != NULL); log = dtx->dd_dtms->dos_log; M0_PRE(log != NULL); @@ -285,12 +274,20 @@ static int dtx_fid_assign(struct m0_dtm0_dtx *dtx, static int dtx_close(struct m0_dtm0_dtx *dtx) { int rc; + int i; + struct m0_dtm0_tx_pa *pa; M0_ENTRY("dtx=%p", dtx); M0_PRE(dtx != NULL); M0_PRE(m0_sm_group_is_locked(dtx->dd_sm.sm_grp)); + for (i = 0; i < dtx->dd_txd.dtd_ps.dtp_nr; ++i) { + pa = &dtx->dd_txd.dtd_ps.dtp_pa[i]; + pa->p_state = max_check(pa->p_state, + (uint32_t)M0_DTPS_EXECUTED); + } + /* * TODO:REDO: We may want to capture the fop contents here. * See ::fol_record_pack and ::m0_fop_encdec for the details. @@ -299,14 +296,11 @@ static int dtx_close(struct m0_dtm0_dtx *dtx) */ rc = dtx_log_insert(dtx); - M0_ASSERT(rc == 0); + if (rc == 0) + m0_sm_state_set(&dtx->dd_sm, M0_DDS_INPROGRESS); + else + m0_sm_move(&dtx->dd_sm, rc, M0_DDS_FAILED); - /* - * Once a dtx is closed, the FOP (or FOPs) has to be serialized - * into the log, so that we should no longer hold any references to it. 
-	 */
-	dtx->dd_fop = NULL;
 
-	m0_sm_state_set(&dtx->dd_sm, M0_DDS_INPROGRESS);
 	return M0_RC(rc);
 }
 
@@ -322,24 +316,6 @@ static void dtx_done(struct m0_dtm0_dtx *dtx)
 	M0_LEAVE();
 }
 
-static void dtx_exec_all_ast_cb(struct m0_sm_group *grp, struct m0_sm_ast *ast)
-{
-	struct m0_dtm0_dtx *dtx = ast->sa_datum;
-
-	M0_ENTRY("dtx=%p", dtx);
-
-	M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_EXECUTED_ALL);
-	M0_ASSERT(dtx->dd_nr_executed == dtx->dd_txd.dtd_ps.dtp_nr);
-
-	if (m0_dtm0_tx_desc_state_exists(&dtx->dd_txd, M0_DTPS_PERSISTENT)) {
-		m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
-		M0_LOG(M0_DEBUG, "dtx " DTID0_F "is stable (EXEC_ALL)",
-		       DTID0_P(&dtx->dd_txd.dtd_id));
-	}
-
-	M0_LEAVE();
-}
-
 static void dtx_persistent_ast_cb(struct m0_sm_group *grp,
 				  struct m0_sm_ast   *ast)
 {
@@ -355,21 +331,20 @@
 
 	m0_mutex_lock(&log->dl_lock);
 	rec = m0_be_dtm0_log_find(log, &txd->dtd_id);
+	dtx = rec == NULL ? NULL : &rec->dlr_dtx;
 
-	if (rec != NULL) {
+	if (dtx != NULL && dtx->dd_sm.sm_state < M0_DDS_STABLE) {
 		dtx = &rec->dlr_dtx;
 		m0_dtm0_tx_desc_apply(&dtx->dd_txd, txd);
 		dtx_log_update(dtx);
-		if (dtx->dd_sm.sm_state == M0_DDS_EXECUTED_ALL &&
-		    m0_dtm0_tx_desc_state_exists(&dtx->dd_txd,
-						 M0_DTPS_PERSISTENT)) {
-			M0_ASSERT(dtx->dd_nr_executed ==
-				  dtx->dd_txd.dtd_ps.dtp_nr);
-			M0_LOG(M0_DEBUG, "dtx " DTID0_F "is stable (PMA)",
-			       DTID0_P(&dtx->dd_txd.dtd_id));
-			m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
-		}
+		M0_LOG(M0_DEBUG, "dtx " DTID0_F " is stable.",
+		       DTID0_P(&dtx->dd_txd.dtd_id));
+		/*
+		 * At this moment, DTX is considered to be stable if
+		 * at least one Pmsg was received.
+		 */
+		m0_sm_state_set(&dtx->dd_sm, M0_DDS_STABLE);
 	}
 
 	m0_mutex_unlock(&log->dl_lock);
@@ -404,9 +379,8 @@ M0_INTERNAL void m0_dtm0_dtx_pmsg_post(struct m0_be_dtm0_log *log,
 	m0_mutex_lock(&log->dl_lock);
 	rec = m0_be_dtm0_log_find(log, &txd->dtd_id);
 	dtx_sm_grp = rec != NULL ? rec->dlr_dtx.dd_sm.sm_grp : NULL;
-	m0_mutex_unlock(&log->dl_lock);
 
-	if (rec != NULL) {
+	if (dtx_sm_grp != NULL) {
 		M0_ASSERT(fop->f_opaque != NULL);
 		pma = fop->f_opaque;
 		*pma = (struct m0_dtm0_pmsg_ast) {
@@ -423,61 +397,8 @@ M0_INTERNAL void m0_dtm0_dtx_pmsg_post(struct m0_be_dtm0_log *log,
 		m0_fop_get(fop);
 		m0_sm_ast_post(dtx_sm_grp, &pma->p_ast);
 	}
-	M0_LEAVE();
-}
-
-static void dtx_executed(struct m0_dtm0_dtx *dtx, uint32_t idx)
-{
-	struct m0_dtm0_tx_pa *pa;
-
-	M0_ENTRY("dtx=%p, idx=%"PRIu32, dtx, idx);
-
-	M0_PRE(dtx != NULL);
-	M0_PRE(m0_sm_group_is_locked(dtx->dd_sm.sm_grp));
-
-	pa = &dtx->dd_txd.dtd_ps.dtp_pa[idx];
-
-	M0_ASSERT(pa->p_state >= M0_DTPS_INPROGRESS);
-
-	pa->p_state = max_check(pa->p_state, (uint32_t)M0_DTPS_EXECUTED);
-
-	dtx->dd_nr_executed++;
-
-	if (dtx->dd_sm.sm_state < M0_DDS_EXECUTED) {
-		M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_INPROGRESS);
-		m0_sm_state_set(&dtx->dd_sm, M0_DDS_EXECUTED);
-	}
-
-	if (dtx->dd_nr_executed == dtx->dd_txd.dtd_ps.dtp_nr) {
-		M0_ASSERT(dtx->dd_sm.sm_state == M0_DDS_EXECUTED);
-		M0_ASSERT_INFO(dtds_forall(&dtx->dd_txd, >= M0_DTPS_EXECUTED),
-			       "Non-executed PAs should not exist "
-			       "at this point.");
-		m0_sm_state_set(&dtx->dd_sm, M0_DDS_EXECUTED_ALL);
-
-		/*
-		 * EXECUTED and STABLE should not be triggered within the
-		 * same ast tick. This ast helps us to enforce it.
-		 * XXX: there is a catch22-like problem with DIX states:
-		 * DIX request should already be in "FINAL" state when
-		 * the corresponding dtx reaches STABLE.
However, the dtx - * cannot transit from EXECUTED to STABLE (through EXEC_ALL) - * if DIX reached FINAL already (the list of CAS rops has been - * destroyed). So that the EXEC_ALL ast cuts this knot by - * scheduling the transition EXEC_ALL -> STABLE in a separate - * tick where DIX request reached FINAL. - */ - dtx->dd_exec_all_ast = (struct m0_sm_ast) { - .sa_cb = dtx_exec_all_ast_cb, - .sa_datum = dtx, - }; - m0_sm_ast_post(dtx->dd_sm.sm_grp, &dtx->dd_exec_all_ast); - } - - m0_mutex_lock(&dtx->dd_dtms->dos_log->dl_lock); - dtx_log_update(dtx); - m0_mutex_unlock(&dtx->dd_dtms->dos_log->dl_lock); + m0_mutex_unlock(&log->dl_lock); M0_LEAVE(); } @@ -516,19 +437,12 @@ M0_INTERNAL void m0_dtx0_fop_assign(struct m0_dtx *dtx, dtx_fop_assign(dtx->tx_dtx, pa_idx, pa_fop); } - M0_INTERNAL int m0_dtx0_close(struct m0_dtx *dtx) { M0_PRE(dtx != NULL); return dtx_close(dtx->tx_dtx); } -M0_INTERNAL void m0_dtx0_executed(struct m0_dtx *dtx, uint32_t pa_idx) -{ - M0_PRE(dtx != NULL); - dtx_executed(dtx->tx_dtx, pa_idx); -} - M0_INTERNAL void m0_dtx0_done(struct m0_dtx *dtx) { struct m0_be_dtm0_log *log; diff --git a/dtm0/dtx.h b/dtm0/dtx.h index ac3dfc8448d..f8907579b7b 100644 --- a/dtm0/dtx.h +++ b/dtm0/dtx.h @@ -33,10 +33,6 @@ enum m0_dtm0_dtx_state { M0_DDS_INIT, /* dtx has a valid tx record. */ M0_DDS_INPROGRESS, - /* dtx got one reply. */ - M0_DDS_EXECUTED, - /* dtx got all replies. */ - M0_DDS_EXECUTED_ALL, /* dtx got enough PERSISTENT messages. */ M0_DDS_STABLE, /* dtx can be released when this state reached. */ @@ -61,8 +57,6 @@ struct m0_dtm0_dtx { struct m0_sm dd_sm; struct m0_dtm0_tx_desc dd_txd; struct m0_dtm0_service *dd_dtms; - uint32_t dd_nr_executed; - struct m0_sm_ast dd_exec_all_ast; /* * XXX: The implementation is very simple and it relies on the idea @@ -129,13 +123,6 @@ M0_INTERNAL void m0_dtx0_fop_assign(struct m0_dtx *dtx, */ M0_INTERNAL int m0_dtx0_close(struct m0_dtx *dtx); -/** - * Notifies DTM0 that DTX is executed on the particular participant. - * @param dtx A DTX that is executed on the particular participant. - * @param pa_idx Index of the participant. - */ -M0_INTERNAL void m0_dtx0_executed(struct m0_dtx *dtx, uint32_t pa_idx); - /** * Marks a transaction as "no longer in-use". 
* The user does not have exclusive ownership on a dtx after it has diff --git a/dtm0/fop.c b/dtm0/fop.c index 3fd5b1ace6d..7b88c6bdefd 100644 --- a/dtm0/fop.c +++ b/dtm0/fop.c @@ -24,6 +24,7 @@ #include "lib/trace.h" /* M0_LOG */ #include "cas/cas.h" #include "cas/cas_xc.h" +#include "dix/fid_convert.h" /* m0_dix_fid__device_id_set */ #include "dtm0/fop.h" #include "dtm0/fop_xc.h" #include "dtm0/addb2.h" @@ -58,7 +59,8 @@ static int dtm0_fom_create(struct m0_fop *fop, struct m0_fom **out, struct m0_reqh *reqh); static void dtm0_fom_fini(struct m0_fom *fom); static size_t dtm0_fom_locality(const struct m0_fom *fom); -static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, +static int dtm0_cas_fop_prepare(struct m0_reqh *reqh, + struct dtm0_req_fop *req, struct m0_fop_type *cas_fopt, struct m0_fop **cas_fop_out); static int dtm0_cas_fom_spawn( @@ -119,6 +121,7 @@ struct m0_sm_state_descr dtm0_phases[] = { .sd_name = "dtm0-entry", .sd_allowed = M0_BITS(M0_FOPH_DTM0_LOGGING, M0_FOPH_DTM0_TO_CAS, + M0_FOPH_DTM0_CAS_DONE, M0_FOPH_SUCCESS, M0_FOPH_FAILURE) }, @@ -150,6 +153,8 @@ struct m0_sm_trans_descr dtm0_phases_trans[] = { {"dtm0-to-cas", M0_FOPH_DTM0_ENTRY, M0_FOPH_DTM0_TO_CAS}, + {"dtm0-empty-rec", M0_FOPH_DTM0_ENTRY, M0_FOPH_DTM0_CAS_DONE}, + {"dtm0-to-cas-fail", M0_FOPH_DTM0_TO_CAS, M0_FOPH_FAILURE}, {"dtm0-cas-done", M0_FOPH_DTM0_TO_CAS, M0_FOPH_DTM0_CAS_DONE}, @@ -202,8 +207,10 @@ M0_INTERNAL int m0_dtm0_fop_init(void) .rpc_flags = M0_RPC_ITEM_TYPE_REPLY, .fom_ops = &dtm0_req_fom_type_ops); - return m0_fop_type_addb2_instrument(&dtm0_req_fop_fopt) ?: + return m0_fop_type_addb2_instrument(&dtm0_req_fop_fopt); + /* m0_fop_type_addb2_instrument(&dtm0_redo_fop_fopt); + */ } @@ -358,7 +365,7 @@ M0_INTERNAL int m0_dtm0_on_committed(struct m0_fom *fom, if (m0_fid_eq(target, source)) target = &txd->dtd_id.dti_fid; - rc = m0_dtm0_req_post(dtms, NULL, &req, target, fom, false); + rc = m0_dtm0_req_post(dtms, NULL, &req, target, fom, true); if (rc != 0) { M0_LOG(M0_WARN, "Failed to send PERSISTENT msg " FID_F " -> " FID_F " (%d).", @@ -532,7 +539,35 @@ static int dtm0_cas_fom_spawn( #endif } -static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, +static void dtm0_cas_sdev_id_set( + struct m0_reqh *reqh, + const struct m0_reqh_service_type *stype, + struct m0_cas_op *cas_op) +{ +#ifndef __KERNEL__ + struct m0_cas_id *cid = &cas_op->cg_id; + uint32_t sdev_id; + + M0_PRE(reqh != NULL); + M0_PRE(stype != NULL); + M0_PRE(cas_op != NULL); + + M0_ENTRY(); + + sdev_id = m0_cas_svc_device_id_get(stype, reqh); + M0_LOG(M0_DEBUG, "Replace device ID: %d -> %d", + m0_dix_fid_cctg_device_id(&cid->ci_fid), sdev_id); + m0_dix_fid__device_id_set(&cid->ci_fid, sdev_id); + + M0_LEAVE(); +#else + /* CAS service is not compiled for kernel. 
*/ + return; +#endif +} + +static int dtm0_cas_fop_prepare(struct m0_reqh *reqh, + struct dtm0_req_fop *req, struct m0_fop_type *cas_fopt, struct m0_fop **cas_fop_out) { @@ -540,6 +575,8 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, struct m0_cas_op *cas_op; struct m0_fop *cas_fop; + M0_ENTRY(); + *cas_fop_out = NULL; M0_ALLOC_PTR(cas_op); @@ -552,9 +589,12 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, &M0_XCODE_OBJ(m0_cas_op_xc, cas_op), req->dtr_payload.b_addr, req->dtr_payload.b_nob); - if (rc == 0) - m0_fop_init(cas_fop, cas_fopt, cas_op, &m0_fop_release); - else + if (rc == 0) { + dtm0_cas_sdev_id_set( + reqh, cas_fopt->ft_fom_type.ft_rstype, cas_op); + m0_fop_init(cas_fop, cas_fopt, cas_op, + &m0_fop_release); + } else M0_LOG(M0_ERROR, "Could not decode the REDO payload"); } @@ -565,7 +605,7 @@ static int dtm0_cas_fop_prepare(struct dtm0_req_fop *req, m0_free(cas_fop); } - return rc; + return M0_RC(rc); } static int dtm0_rmsg_fom_tick(struct m0_fom *fom) @@ -576,6 +616,12 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) struct dtm0_fom *dfom = M0_AMB(dfom, fom, dtf_fom); struct dtm0_req_fop *req = m0_fop_data(fom->fo_fop); struct m0_fop *cas_fop = NULL; + struct m0_dtm0_service *svc = m0_dtm0_fom2service(fom); + struct m0_dtm0_recovery_machine *m = &svc->dos_remach; + const struct m0_dtm0_tx_desc *txd = &req->dtr_txr; + + M0_PRE(ergo(m0_dtm0_tx_desc_is_none(txd), + !!(req->dtr_flags & M0_BITS(M0_DMF_EOL)))); M0_ENTRY("fom %p phase %d", fom, phase); @@ -584,7 +630,9 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) result = m0_fom_tick_generic(fom); break; case M0_FOPH_DTM0_ENTRY: - m0_fom_phase_set(fom, M0_FOPH_DTM0_TO_CAS); + m0_fom_phase_set(fom, + m0_dtm0_tx_desc_is_none(txd) ? + M0_FOPH_DTM0_CAS_DONE : M0_FOPH_DTM0_TO_CAS); break; case M0_FOPH_DTM0_TO_CAS: /* REDO_END()s from all recovering processes received, send @@ -593,7 +641,8 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) cs_ha_process_event(m0_cs_ctx_get(m0_fom_reqh(fom)), M0_CONF_HA_PROCESS_DTM_RECOVERED); */ - rc = dtm0_cas_fop_prepare(req, &cas_put_fopt, &cas_fop); + rc = dtm0_cas_fop_prepare(dfom->dtf_fom.fo_service->rs_reqh, + req, &cas_put_fopt, &cas_fop); if (rc == 0) { rc = dtm0_cas_fom_spawn(dfom, cas_fop, &dtm0_cas_done_cb); @@ -621,6 +670,25 @@ static int dtm0_rmsg_fom_tick(struct m0_fom *fom) M0_FOPH_FAILURE); } else { m0_fom_phase_set(fom, M0_FOPH_SUCCESS); + /* + * TODO: make it async when recovery machine starts + * using a larger sliding window or the amount of + * participants gets bigger than the length of + * the EOL queue. + * At this moment EOLQ_MAX_LEN is 100, and we may + * have at most 3 concurrent EOL. It leaves room + * for 97 pending HA state transitions which is + * far more than enough for a 3 node cluster with + * one single failure. + */ + /* + * TODO: Consider propagating FOM or its sm_id + * (directly or indirectly) down to the recovery + * machine, so that we may relate REDO FOM + * and the corresponding recovery FOM. + */ + M0_BE_OP_SYNC(op, + m0_dtm0_recovery_machine_redo_post(m, req, &op)); } break; default: diff --git a/dtm0/fop.h b/dtm0/fop.h index 38f00cc15bf..357aac4a572 100644 --- a/dtm0/fop.h +++ b/dtm0/fop.h @@ -47,16 +47,34 @@ enum m0_dtm0s_msg { DTM_EXECUTE, DTM_EXECUTED, DTM_PERSISTENT, - DTM_REDO + DTM_REDO, } M0_XCA_ENUM; +enum m0_dtm0_msg_flags { + M0_DMF_EOL, + M0_DMF_EVICTION, +}; + /** A DTM0 message sent as an RPC request to remote DTM0 services. 
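 *
 * A request with an all-zero ("none") transaction descriptor is only valid
 * when M0_DMF_EOL is set in dtr_flags: such EOL messages mark the end of the
 * sender's log during recovery (see the precondition in
 * dtm0_rmsg_fom_tick()).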
*/ struct dtm0_req_fop { uint32_t dtr_msg M0_XCA_FENUM(m0_dtm0s_msg); struct m0_dtm0_tx_desc dtr_txr; struct m0_buf dtr_payload; + uint64_t dtr_flags; + /** + * The participant (DTM0 service) that sent this message. + * The initiator is set for DTM_REDO messages. + */ + struct m0_fid dtr_initiator; } M0_XCA_RECORD M0_XCA_DOMAIN(rpc); + +#define REDO_F "redo={ txid=" DTID0_F ", ini=" FID_F ", is_eol=%d }" +#define REDO_P(_redo) \ + DTID0_P(&(_redo)->dtr_txr.dtd_id), \ + FID_P(&(_redo)->dtr_initiator), \ + !!((_redo)->dtr_flags & M0_BITS(M0_DMF_EOL)) + struct dtm0_rep_fop { /** Status code of dtm operation. */ int32_t dr_rc; diff --git a/dtm0/it/all2all/.gitignore b/dtm0/it/all2all/.gitignore new file mode 100644 index 00000000000..5e548163dff --- /dev/null +++ b/dtm0/it/all2all/.gitignore @@ -0,0 +1,4 @@ +m0trace.* +addb_*/ +addb-stobs-*/ +m0crate.yaml diff --git a/dtm0/it/all2all/all2all b/dtm0/it/all2all/all2all index f659b62b9d7..b6db8f8985a 100755 --- a/dtm0/it/all2all/all2all +++ b/dtm0/it/all2all/all2all @@ -15,6 +15,10 @@ LOOP_IMG_DIR=$TEST_ROOT CLIENT_PID= M0D_DIR_COMMON=$MOTR_VAR_DIR/m0d-0x720000000000000 ADDB_DUMP_DIR="/tmp/a2a-addb-out" +CTGDUMP="$MOTR_ROOT/cas/m0ctgdump" +DINDEX="1:5" +CTGDUMP_DIR="/tmp/a2a-ctgdump" + M0D_ENDPOINTS=() M0D_FIDS_DEC=() @@ -23,10 +27,16 @@ M0D_PIDS=() M0D_CLI_FID_DEC= M0D_CLI_FID_HEX= +MOD_CLI_EP= POOL_WIDTH=4 IPOOL_WIDTH=2 +# m0d that got killed +VICTIM=1 +# m0ds that are alive +WITNESSES=(0 2) + . ${MOTR_ROOT}/scripts/addb-py/chronometry/common/common_funcs @@ -37,7 +47,12 @@ function stop_cluster() function bootstrap_cluster() { - hctl bootstrap --mkfs $CURRENT_CDF + # XXX: For some reason Hare does not like localhost. + hctl bootstrap --mkfs <(sed "s/localhost/$(hostname -f)/" $CURRENT_CDF) + if [[ $? -ne 0 ]]; then + _err "Cluster bootstrap failed" + exit 1 + fi } function get_m0d_pids() @@ -46,7 +61,7 @@ function get_m0d_pids() local pid for fid in ${M0D_FIDS_HEX[@]} ; do - pid=$(ps ax | grep m0d | grep $fid | awk '{ print $1; }') + pid=$(pgrep -a lt-m0d | grep $fid | awk '{ print $1; }') M0D_PIDS+=($pid) pids+="$pid " done @@ -60,10 +75,12 @@ function create_m0crate_cfg() local svcs_json_out=$(echo $hctl_json_out | jq -r '.nodes[] | .svcs[]') local PROF=$(echo $hctl_json_out | jq -r '.profiles[] | .fid') - local MOTR_LOCAL_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client")) | .ep') - local PROCESS_FID=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client")) | .fid') + local MOTR_LOCAL_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client")) | .ep') + local PROCESS_FID=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client")) | .fid') local MOTR_HA_ADDR=$(echo $svcs_json_out | jq -r 'select( .name | contains("hax")) | .ep') + MOD_CLI_EP=$(echo "$MOTR_LOCAL_ADDR") + local M0CRATE_CFG_TMP=m0crate_cfg.tmp cp $M0CRATE_CFG_IN $M0CRATE_CFG_TMP sed -i "s/###__PROF__###/$PROF/g" $M0CRATE_CFG_TMP @@ -77,64 +94,175 @@ function get_params_for_ha_msgs() { local svcs_json_out=$(hctl status --json | jq -r '.nodes[] | .svcs[]') local svc_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("ioservice"))') - local cli_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("m0_client"))') - M0D_ENDPOINTS=($(echo $svc_json_out | jq -r '.ep' | sed -E 's/.*@tcp[:](.*)/\1/')) + local cli_json_out=$(echo $svcs_json_out | jq -r 'select( .name | contains("motr_client"))') + M0D_ENDPOINTS=($(echo $svc_json_out | jq -r '.ep')) M0D_FIDS_HEX=($(echo $svc_json_out | jq -r '.fid' | 
sed -E 's/0x720+([0-9][:]0x[A-Za-z0-9]+)/\1/')) M0D_FIDS_DEC=($(echo $svc_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:])(0x[A-Za-z0-9]+)/printf "%s%d" \1 \2/e')) M0D_CLI_FID_DEC=$(echo $cli_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:])(0x[A-Za-z0-9]+)/printf "%s%d" \1 \2/e') M0D_CLI_FID_HEX=$(echo $cli_json_out | jq -r '.fid' | sed -E 's/0x720+([0-9][:]0x[A-Za-z0-9]+)/\1/') } -function ha_msg_send_transient() +# params +# (i) whom to send the message (endpoint). Use "cli" for the client; 0, 1, 2, etc. for m0ds. +# (ii) who has changed its state. Use "cli" or 0, 1, 2. +# (iii) what happened. Use "transient", "online" etc. +function ha_msg_send() { - # Here we send "TRANSIENT" messages to trigger start of - # HA messages handling on the m0d side as dtm0 doesn't - # handle them until "TRANSIENT" received due to incomplete - # implementation on the Hare side. - for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do - for j in $(seq 0 $((${#M0D_FIDS_DEC[@]}-1))) ; do - if [[ $i -ne $j ]]; then - $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_FIDS_DEC[$j]}" "transient" - break - fi - done - done + local whom="$1" + local who="$2" + local what="$3" + + if [[ "x$whom" == "xcli" ]]; then + whom="${MOD_CLI_EP}" + else + whom="${M0D_ENDPOINTS[$whom]}" + fi + + if [[ "x$who" == "xcli" ]]; then + who="${M0D_CLI_FID_DEC}" + else + who="${M0D_FIDS_DEC[$who]}" + fi + + _info "ha_msg_send: whom='${whom}', who='${who}', what='${what}'" + $MOTR_ST_UTILS_DIR/ha_msg_send.sh "$whom" "^r|$who" "$what" } -function ha_msg_send_online() +function ha_msg_send_cli() { - # Here we send "ONLINE" messages to trigger connections logic. - for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do - for j in $(seq 0 $((${#M0D_FIDS_DEC[@]}-1))) ; do - if [[ $i -ne $j ]]; then - $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_FIDS_DEC[$j]}" "online" - fi - done - done + local whom="cli" + local who="$1" + local what="$2" + ha_msg_send $whom $who $what } -function ha_msg_send_cli_online() +# params +# (i) who Who changed its state? +# (ii) what What is the new state? +function ha_msg_send_m0ds() { - # Here we send "ONLINE" messages to connect servers to client. - for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))) ; do - $MOTR_ST_UTILS_DIR/ha_msg_send.sh "${M0D_ENDPOINTS[$i]}" "^r|${M0D_CLI_FID_DEC}" "online" + local who="$1" + local what="$2" + + if [[ ${#M0D_ENDPOINTS[@]} -ne ${#M0D_FIDS_DEC[@]} ]]; then + echo "The number of endpoints is not equal to the number of fids. How is that possible?" + exit 1 + fi + + for i in $(seq 0 $((${#M0D_ENDPOINTS[@]}-1))); do + ha_msg_send $i $who $what done } -function expected_trace_lines_num() +function ha_msg_send_all() +{ + local who="$1" + local what="$2" + ha_msg_send_cli "$who" "$what" + ha_msg_send_m0ds "$who" "$what" +} + +# param +# (i) trace_path Path to the trace file +# (ii) pattern String to grep. +# (iii) exp_cnt Expected number of lines that match the pattern. 
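+# Example (hypothetical fid and pid):
+#   expect_trace_lines "${M0D_DIR_COMMON}<fid>/m0trace.<pid>" "ALL2ALL_STARTED" 1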
+function expect_trace_lines +{ + local trace_path="$1" + local pattern="$2" + local exp_cnt="$3" + local cnt + local cmd; + + _info "expect trace: path=$trace_path, pattern=$pattern, exp_cnt=$exp_cnt" + cnt=$($MOTR_ROOT/utils/trace/m0trace -i "$trace_path*" | grep -c "$pattern") + if [[ $cnt -ne $exp_cnt ]]; then + _info "Unexpected number of trace lines: $cnt != $exp_cnt" + return 1 + fi + + _info "Found" + return 0 +} + +function expect_trace_lines_from_m0d() +{ + local index="$1" + local pattern="$2" + local exp_cnt="$3" + local path="${M0D_DIR_COMMON}${M0D_FIDS_HEX[index]}/m0trace.${M0D_PIDS[index]}" + + if expect_trace_lines "$path" "$pattern" "$exp_cnt"; then + return 0; + fi + + return 1; +} + +function expect_trace_lines_from_m0ds() { local pattern="$1" local exp_cnt=$2 - local cnt for i in ${!M0D_PIDS[@]} ; do - cnt=$($MOTR_ROOT/utils/trace/m0trace -i "${M0D_DIR_COMMON}${M0D_FIDS_HEX[i]}/m0trace.${M0D_PIDS[i]}" | grep "$pattern" | wc -l) - if [[ $cnt -ne $exp_cnt ]]; then + if ! expect_trace_lines_from_m0d "$i" "$pattern" "$exp_cnt"; then + _info "Not found in ${M0D_PIDS[i]}" return 1 fi done - return 0 + return 0; +} + +function expect_trace_lines_from_cli() +{ + local pattern="$1" + local exp_cnt="$2" + local path="$PWD/m0trace.$CLIENT_PID" + local cnt + + if ! expect_trace_lines "$path" "$pattern" "$exp_cnt"; then + return 1; + fi + + return 0; +} + +function expect_trace_lines_from_all() +{ + local pattern="$1" + local exp_cnt="$2" + + if expect_trace_lines_from_m0ds "$pattern" "$exp_cnt" && \ + expect_trace_lines_from_cli "$pattern" "$exp_cnt"; then + return 0; + fi + + return 1 +} + +function addb2dump() +{ + local inpfile=$1 + local outfile=$2 + local a2d=$MOTR_ROOT/utils/m0addb2dump + local size + + _info "Dumping ${inpfile} -> ${outfile} ..." + + if ! $a2d -f "${inpfile}" > "${outfile}"; then + if [[ "$(stat -c %s ${inpfile})" == "0" ]]; then + _info "Skipping empty ADDB file." + return 0; + else + _info "Addb file is not empty but cannot dump it" + return 1; + fi + else + echo "OK." + return 0; + fi + } function addb_dump() @@ -143,7 +271,6 @@ function addb_dump() local outfile local inpfile local fid - local a2d=$MOTR_ROOT/utils/m0addb2dump rm -fR "${outdir}" mkdir "${outdir}" @@ -152,15 +279,17 @@ function addb_dump() fid=$(echo "${M0D_FIDS_HEX[i]}" | awk -F'x' '{ print $2; }') outfile="${outdir}/addb_${fid}.dump" inpfile="${M0D_DIR_COMMON}${M0D_FIDS_HEX[i]}/addb-stobs-${M0D_PIDS[i]}/o/100000000000000:2" - _info "Dumping ${inpfile} -> ${outfile} ..." - $a2d -f "${inpfile}" > "${outfile}" + addb2dump ${inpfile} ${outfile} done - inpfile="$PWD/addb_${CLIENT_PID}/o/100000000000000:2" - fid=$(echo "$M0D_CLI_FID_HEX" | awk -F'x' '{ print $2; }') - outfile="${outdir}/addb_${fid}.dump" - _info "Dumping ${inpfile} -> ${outfile} ..." - $a2d -f "${inpfile}" > "${outfile}" + if [[ "x$CLIENT_PID" != "x" ]]; then + inpfile="$PWD/addb_${CLIENT_PID}/o/100000000000000:2" + fid=$(echo "$M0D_CLI_FID_HEX" | awk -F'x' '{ print $2; }') + outfile="${outdir}/addb_${fid}.dump" + addb2dump ${inpfile} ${outfile} + else + _info "Skipping client addb dumps." + fi } function processes_status_check() @@ -186,10 +315,135 @@ function fail() exit 1 } -function main() +function client_run() +{ + _info "Launching the client..." + $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG & + CLIENT_PID=$! + _info "Client pid: ${CLIENT_PID}" +} + +function client_run_gdb() +{ + _info "Launching the client under gdb ..." 
+	libtool --mode=execute gdb --args $MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG
+}
+
+function client_wait()
+{
+	wait ${CLIENT_PID}
+	rc="$?"
+	if [[ "$rc" -gt 127 ]]; then
+		_err "Client terminated, rc = $rc"
+		exit 1
+	fi
+	return "$rc"
+}
+
+function m0kv_async()
+{
+	local args
+	args=$(python3 -c "import yaml; \
+		conf = yaml.load(open('$M0CRATE_CFG'), Loader=yaml.FullLoader); \
+		m = conf['MOTR_CONFIG']; \
+		print('-l %s -h %s -p %s -f %s' % (m['MOTR_LOCAL_ADDR'], m['MOTR_HA_ADDR'], m['PROF'], m['PROCESS_FID']));")
+
+	_info "m0kv $args $@"
+	${MOTR_UTILS_DIR}/m0kv $args $@ &
+	CLIENT_PID=$!
+}
+
+function m0kv()
+{
+	m0kv_async $@
+	client_wait
+}
+
+function m0kv_run_sync()
+{
+	m0kv index create "${DINDEX}"
+	m0kv -s index put "${DINDEX}" mykey myvalue
+	m0kv -s index get "${DINDEX}" mykey
+	m0kv index drop "${DINDEX}"
+}
+
+function client_run_sync()
+{
+	client_run
+	client_wait
+}
+
+function ctgdump()
+{
+	local svc_num=$1
+	local proc_fid="0x720000000000000${M0D_FIDS_HEX[$svc_num]}"
+	local index=$DINDEX
+	local svc_ep="${M0D_ENDPOINTS[$svc_num]}"
+	local stobs="/var/motr/m0d-${proc_fid}/stobs"
+	local db="/var/motr/m0d-${proc_fid}/db"
+	local rest="-m 524288 -q 16 -w 8 -U -r 134217728 -c /etc/motr/confd.xc"
+	local out_dir=$CTGDUMP_DIR
+	local fid_hex=$(echo "${M0D_FIDS_HEX[$svc_num]}" | awk -F'x' '{ print $2; }')
+	local out_file="${out_dir}/cas-${fid_hex}.dump"
+	local cmd=$CTGDUMP
+
+	local cmd_args="-e libfab:${svc_ep} -A linuxstob:addb-stobs -f ${proc_fid} -T ad -S ${stobs} -D ${db} ${rest} str ${index}"
+
+	mkdir -p $out_dir
+
+	_info "$cmd $cmd_args"
+	$cmd $cmd_args | sort > "$out_file"
+}
+
+function ctg_eq()
+{
+	local left=$1
+	local right=$2
+	local left_hex=$(echo "${M0D_FIDS_HEX[$left]}" | awk -F'x' '{ print $2; }')
+	local right_hex=$(echo "${M0D_FIDS_HEX[$right]}" | awk -F'x' '{ print $2; }')
+
+	local left_file="${CTGDUMP_DIR}/cas-${left_hex}.dump"
+	local right_file="${CTGDUMP_DIR}/cas-${right_hex}.dump"
+
+	if ! diff -u $left_file $right_file; then
+		_info "Catalogues are different. Run 'diff -u $left_file $right_file' to see the difference."
+		return 1
+	else
+		local nr_records=$(wc -l $left_file | cut -f1 -d' ')
+		_info "Compared ${left_hex} with ${right_hex}. Found ${nr_records} identical records."
+		return 0;
+	fi
+}
+
+function check_ctg_consistency()
+{
+	local ref=$VICTIM
+	local targets=(${WITNESSES[@]})
+
+	rm -fR "$CTGDUMP_DIR"
+
+	for i in $(seq 0 $((${#M0D_FIDS_HEX[@]}-1))); do
+		ctgdump $i
+	done
+
+	for i in ${targets[@]}; do
+		if ! ctg_eq $ref $i; then
+			local ref_pid=${M0D_PIDS[$ref]}
+			local tgt_pid=${M0D_PIDS[$i]}
+			_info "Inconsistency detected between processes $ref_pid and $tgt_pid."
+			exit 1
+		fi
+	done
+}
+
+
+# Phase where we power on the cluster.
+function boot_phase()
 {
 	local cli_pid
 
+	_info "Phase recovery:boot"
+
 	${MOTR_UTILS_DIR}/m0setup --init-loop-only -s 1 -d ${TEST_ROOT} --pool-width ${POOL_WIDTH} --ipool-width ${IPOOL_WIDTH}
 
 	_info "Bootstrapping the cluster using Hare..."
@@ -200,16 +454,112 @@
 	_info "Create m0crate configuration..."
 	create_m0crate_cfg
+}
 
-	_info "Run the client..."
-	$MOTR_ROOT/motr/m0crate/m0crate -S $M0CRATE_CFG &
-	cli_pid=$!
-	wait ${cli_pid}
-	_info "Client pid: ${cli_pid}"
-	CLIENT_PID=${cli_pid}
 
-	stop_cluster
+# Phase where we ensure that all m0ds are online
+# from DTM0 perspective.
+function online_phase()
+{
+	_info "Phase recovery:online"
+
+	_info "Wait until every m0d started"
expect_trace_lines_from_m0ds "ALL2ALL_STARTED" 1; do + :; + done + + _info "Wait until every m0d has recovered" + while ! expect_trace_lines_from_m0ds "ALL2ALL_DTM_RECOVERED" 1; do + :; + done +} + +# Phase where we perform recovery. +function recovery_phase() +{ + # 2 * (3 + 1) where: + # 2 is the number of witnesses; + # 3 is the number of records (mykey1, mykey2, mykey3); + # 1 is the EOL. + local nr_redo=8 + local redo_pattern="m0_dtm0_recovery_machine_redo_post > in-redo" + local dtx_done_pattern="dtx_done >" + local m0kv_wait_file="/tmp/m0kv_wait" + + _info "Phase recovery:recovery" + local svc_name="m0d@0x720000000000000${M0D_FIDS_HEX[VICTIM]}.service" + + _info "Create an index" + m0kv index create "${DINDEX}" + + _info "Populate the index" + m0kv -s index put "${DINDEX}" mykey1 myvalue1 + + _info "PUT one key, hang on" + m0kv_async -s index put "${DINDEX}" mykey2 myvalue2 \ + wait "${m0kv_wait_file}" put "${DINDEX}" mykey3 myvalue3 + + _info "Wait until mykey2 reached the persistent storage on at least one node." + while ! expect_trace_lines_from_cli "$dtx_done_pattern" 1; do + :; + done + + _info "Kill the victim (${M0D_PIDS[VICTIM]})." + kill -9 "${M0D_PIDS[VICTIM]}" + + # XXX We may use the pool machine state change to identify this point. + # However, it may not be enough: we need to get to the point + # where the even was consumed _fully_ (i.e., Motr sent "ack" back to Hare). + _info "Wait a few seconds to ensure everyone learned about it." + sleep 10 + + # Let the client create a non-fully replicated record. + touch "$m0kv_wait_file" + + client_wait + + # TODO: Run ctgcmp to ensure mykey3 does not exist in the victim's storage. + + _info "Resurrect the victim: $svc_name" + systemctl start "$svc_name" + + # Victim's PID should be updated. + M0D_PIDS=() + get_m0d_pids + _info "Wait until the victim gets started." + while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_STARTED" 1; do + :; + done + + _info "Wait until the victim gets recovered." + while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_DTM_RECOVERED" 1; do + :; + done + + _info "Wait until the victim gets recovered." + while ! expect_trace_lines_from_m0d ${VICTIM} "ALL2ALL_DTM_RECOVERED" 1; do + :; + done + + _info "Ensure we got enough REDO messages." + while ! expect_trace_lines_from_m0d ${VICTIM} "$redo_pattern" $nr_redo; do + :; + done + + # TODO: Run ctgcmp to ensure mykey3 exists in the victim's storage. +} + +# Phase where we are gathering artifacts +# and shutting down the cluster. +function shutdown_phase() +{ + _info "Phase recovery:shutdown" + + _info "m0d pids: ${M0D_PIDS[@]}" + + stop_cluster addb_dump + check_ctg_consistency _info "Checking processes exit status..." processes_status_check || { @@ -220,4 +570,144 @@ function main() _info "TEST STATUS: PASSED" } -main +function recovery_cycle() +{ + _info "Starting the recovery cycle" + + boot_phase && \ + online_phase && \ + recovery_phase && \ + shutdown_phase +} + +function simple_boot_cycle() +{ + _info "Starting the simple boot cycle" + + boot_phase && + client_run_sync && + shutdown_phase +} + +function print_help() +{ + echo -en " + The script allows you to check one of the following cases: + '$0 ss' - 'Simple' bootstrap/shutdown of the 3-process cluster. + '$0 rec' - 'Recovery procedures' where the cluster is trying to + recover a failed participant. + Each case is called a 'cycle', and there are several + phases withing each cycle. You can use these phases to manually + run pieces of the cycles or individual commands. 
For example: + $0 rec boot + $0 rec m0kv index create \"1:5\" + $0 rec m0kv -s index put \"1:5\" mykey myvalue + $0 rec m0kv -s index put \"1:5\" mykey myvalue + $0 rec m0kv -s index get \"1:5\" mykey + $0 rec m0kv -s index next \"1:5\" \"\\\0\" 1 + $0 rec m0kv -s index del \"1:5\" mykey + $0 rec stop + + Debugging and testing: + 1. Use DTM0 UTs to ensure recovery machine works + with your changes: + sudo ./utils/m0run -d -- m0ut -t dtm0-ut + 2. Use 'dtm0-remach' gdb command to print backtraces + of the coroutines: + $ m0trace -i | grep recovery_machine_init + .... m=0xabcd + $ gdb -p + (gdb) p (struct m0_dtm0_recovery_machine *) 0xabcd + \$1 = (struct m0_dtm0_recovery_machine *) 0xabcd + (gdb) dtm0-remach $1 + < ... prints information about machine ...> + + Known issues: + 1. FOM HUNG warnings for recovery FOM. They have + one single state for waiting on something, and because + of that you may see this warning. + \n"; +} + +# Prints various variables used by the +# script. The function is used only +# for debugging of the script. +function print_env() +{ + get_params_for_ha_msgs + get_m0d_pids + echo "M0D_ENDPOINTS: ${M0D_ENDPOINTS[@]}" + echo "M0D_FIDS_DEC: ${M0D_FIDS_DEC[@]}" + echo "M0D_FIDS_HEX: ${M0D_FIDS_HEX[@]}" + echo "M0D_PIDS: ${M0D_PIDS[@]}" + + echo "M0D_CLI_FID_DEC: ${M0D_CLI_FID_DEC}" + echo "M0D_CLI_FID_HEX: ${M0D_CLI_FID_HEX}" + echo "MOD_CLI_EP: ${MOD_CLI_EP}" +} + +# params: +# (i) cycle_name Name of the test (cycle) +# (ii) phase_name Name of a phase in the cycle. +function main() +{ + local cycle_name="$1" + local phase_name="$2" + + # Run the whole cycle if phase was not + # specified. + if [ "x$phase_name" == "x" ]; then + case $cycle_name in + "ss") + simple_boot_cycle;; + "rec") + recovery_cycle;; + "-h") + print_help;; + "--help") + print_help;; + "--print-env") + print_env;; + *) + # Run ordinary all2al by default. + simple_boot_cycle;; + esac + else + if [[ "x$cycle_name" == "xrec" ]]; then + case $phase_name in + "boot") + boot_phase;; + "online") + online_phase;; + "recovery") + recovery_phase;; + "shutdown") + shutdown_phase;; + "m0kv") + shift; + shift; + m0kv $@;; + "stop") + killall -9 lt-m0ham || true; + killall -9 lt-m0d || true; + killall -9 hax || true; + killall -9 lt-m0crate || true; + killall -9 lt-m0kv || true; + killall -9 lt-m0mkfs || true; + hctl shutdown;; + *) + echo "Wrong phase: $phase_name" + echo "Use one of the following: " + echo " boot online recovery shutdown stop" + exit 1;; + esac + else + echo "Unsupported cycle: $cycle_name." + echo "Use 'r' or 's' (recovery, simple)." 
+ exit 1 + fi + fi + +} + +main $@ diff --git a/dtm0/it/all2all/cdf.yaml b/dtm0/it/all2all/cdf.yaml index 85122b73bf1..aea11c4a3ff 100644 --- a/dtm0/it/all2all/cdf.yaml +++ b/dtm0/it/all2all/cdf.yaml @@ -2,25 +2,26 @@ nodes: - hostname: localhost data_iface: eth0 data_iface_type: tcp + transport_type: libfab m0_servers: - runs_confd: true io_disks: data: [] - io_disks: data: - - /dev/loop0 - - /dev/loop1 + - path: /dev/loop0 + - path: /dev/loop1 - io_disks: data: - - /dev/loop2 - - /dev/loop3 + - path: /dev/loop2 + - path: /dev/loop3 - io_disks: data: - - /dev/loop4 - - /dev/loop5 + - path: /dev/loop4 + - path: /dev/loop5 m0_clients: - s3: 0 - other: 1 + - name: motr_client + instances: 1 pools: - name: SNS pool @@ -32,6 +33,7 @@ pools: - name: DIX pool type: dix # optional; supported values: "sns" (default), "dix", "md" data_units: 1 - parity_units: 1 + parity_units: 2 + spare_units: 0 allowed_failures: { site: 0, rack: 0, encl: 0, ctrl: 1, disk: 1 } diff --git a/dtm0/linux_kernel/stubs.c b/dtm0/linux_kernel/stubs.c index da6518142e8..6b5d629633e 100644 --- a/dtm0/linux_kernel/stubs.c +++ b/dtm0/linux_kernel/stubs.c @@ -57,3 +57,54 @@ M0_INTERNAL int m0_dtm0_req_post(struct m0_dtm0_service *svc, (void) wait_for_ack; return 0; } + +#include "dtm0/recovery.h" + +M0_INTERNAL int m0_drm_domain_init(void) +{ + return 0; +} + +M0_INTERNAL void m0_drm_domain_fini(void) +{ + +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m, + const struct m0_dtm0_recovery_machine_ops *ops, + struct m0_dtm0_service *svc) +{ + (void) m; + (void) svc; + (void) ops; +} + +M0_INTERNAL int +m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m) +{ + (void) m; + return 0; +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m) +{ + (void) m; +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m) +{ + (void) m; +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, + struct dtm0_req_fop *redo, + struct m0_be_op *op) +{ + (void) m; + (void) redo; + (void) op; +} diff --git a/dtm0/recovery.c b/dtm0/recovery.c new file mode 100644 index 00000000000..1a60a36f10b --- /dev/null +++ b/dtm0/recovery.c @@ -0,0 +1,2203 @@ +/* -*- C -*- */ +/* + * Copyright (c) 2022 Seagate Technology LLC and/or its Affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For any questions about this software or licensing, + * please email opensource@seagate.com or cortx-questions@seagate.com. 
+ * + */ + + +/** + @page dtm0br-dld DLD of DTM0 basic recovery + + - @ref DTM0BR-ovw + - @ref DTM0BR-def + - @ref DTM0BR-req + - @ref DTM0BR-depends + - @ref DTM0BR-highlights + - @subpage DTM0BR-fspec "Functional Specification" + - @ref DTM0BR-lspec + - @ref DTM0BR-lspec-comps + - @ref DTM0BR-lspec-rem-fom + - @ref DTM0BR-lspec-loc-fom + - @ref DTM0BR-lspec-evc-fom + - @ref DTM0BR-lspec-state + - @ref DTM0BR-lspec-thread + - @ref DTM0BR-lspec-numa + - @ref DTM0BR-conformance + - @ref DTM0BR-usecases + - @ref DTM0BR-ref + - @ref DTM0BR-impl-plan +--->>> +Max: [* defect *] ut and st sections are missing. +IvanA: Agree. I will add it once we are more-or-less confident in use-cases and + requirements. +<<<--- +--->>> +Max: [* style *] usecases section should go to .h file, along with the rest of + the sections in .h in DLD template. +<<<--- + + +
+   @section DTM0BR-ovw Overview
+   All specifications must start with an Overview section that
+   briefly describes the document and provides any additional
+   instructions or hints on how to best read the specification.
+
+   The document describes how a DTM0 service is supposed to restore
+   consistency of the data replicated across a certain kind of Motr-based
+   cluster. The term "basic" implies that consistency is restored only
+   for a limited number of use-cases. In the rest of the document,
+   the term "basic" is omitted for simplicity (there are no other kinds of
+   DTM0-based recovery at the moment).
+
+--->>>
+Max: [* defect *] This is an obvious definition, which has very small value.
+  Please make an overview which would bring as much useful information to the
+  reader as it can in just a few lines.
+IvanA: [fixed] I tried to stick to the point, and avoid any "new" terms in
+  the paragraph.
+<<<---
+
+
+   @section DTM0BR-def Definitions
+   Mandatory.
+   The DLD shall provide definitions of the terms and concepts
+   introduced by the design, as well as the relevant terms used by the
+   specification but described elsewhere. References to the
+   M0 Glossary and the component's HLD are permitted and encouraged.
+   Agreed upon terminology should be incorporated in the glossary.
+
+--->>>
+Max: [* question *] Are those all the terms that are used here? If no, please
+  add the rest of the terms. Please also consider adding terms that are being
+  used, but which are not in DTM0 HLD.
+IvanA: [fixed] No, it seems like they are not used. I removed this part.
+  I'll populate the "new terms" first, and if there are any redundant
+  information, I'll add references to the HLD.
+<<<---
+
+   Terms and definitions used in the document:
+   - User service is a Motr service that is capable of replicating
+   its data ("user data") across the cluster. The document is focused on
+   CAS (see @ref cas-dld) but this term could be applied to any Motr service
+   that supports CRDT[2] and has behavior similar to CAS.
+   Note, this document does not differentiate between actual "clients" and
+   "servers". For example, the Motr client (including DIX) is also considered
+   to be a "User service".
+   - DTM0 service is a Motr service that helps a user service restore
+   consistency of its replicated data.
+   - Recoverable process is a Motr process that has exactly one user
+   service and exactly one DTM0 service in it. Note that in a UT environment,
+   a single OS process may have several user/DTM0 services, thus it may
+   have multiple recoverable processes.
+--->>>
+IvanA: [* question *] @Max, "Recoverable process" does not sound good to me.
+   What do you think about "DTM0 process"? or "DTM0-backed process"?
+   I just want to avoid confusion with confd and any other Motr process that
+   does not have a DTM0 service in it. Any ideas on that?
+<<<---
+   - Recovery procedures is a broad term used to point at DTM0 services'
+   reactions to HA notifications about states of DTM0 services. In other
+   words, it refers to the actions performed by DTM0 services that
+   help to restore consistency of replicated data.
+   - Recovery machine is a state machine running within a DTM0 service
+   that performs recovery procedures. Each recoverable process
+   has a single instance of the recovery machine.
+   - Recovery FOM is a long-lived FOM responsible for reacting to
+   HA-provided state changes of a recoverable process. The FOM knows the
+   id (FID) of this process (Recovery FOM id). If the recovery
+   FOM id matches the id of the process it is running on, then
+   this FOM is called "local". Otherwise, it is called "remote".
+   If a Motr cluster has N recoverable processes in the configuration, then
+   each recoverable process has N recovery FOMs (one per remote counterpart
+   plus one local FOM).
+   - Recovery FOM role is a sub-set of states of a recovery FOM. Each
+   recovery FOM may have one of the following roles: remote recovery,
+   local recovery, eviction. The term "role" is often omitted.
+   - Recovery FOM reincarnation is a transition between the roles
+   of a recovery FOM. Reincarnation happens as a reaction to HA notifications
+   about state changes of the corresponding process.
+   - Remote recovery role defines a sub-set of states of a recovery FOM
+   that performs recovery of a remote process (by sending REDO messages).
+   - Local recovery role defines a sub-set of states of a recovery FOM
+   that is responsible for starting and stopping recovery procedures on
+   the local recoverable process.
+   - Eviction role defines a sub-set of states of a recovery FOM that
+   restores consistency of up to N-1 recoverable processes (where N is the
+   number of recoverable processes in the configuration) after one of the
+   recoverable processes of the cluster experienced a permanent failure
+   (see HLD[1]).
+   - W-request is a FOP sent from one user service to another that
+   causes modifications of persistent storage. Such a request always
+   contains DTM0-related meta-data (transaction descriptor). W-requests
+   are used to replicate data across the cluster. They are stored in the
+   DTM0 log and replayed by the DTM0 service in the form of REDO messages.
+   In case of CAS, W-requests are PUT and DEL operations.
+   - R-request is a user service FOP that does not modify persistent
+   storage. R-requests are allowed to be sent only to the processes that
+   have ONLINE state. In case of CAS, R-requests are GET and NEXT operations.
+--->>>
+Max: [* defect *] The terms are not defined. Please define them.
+IvanA: I populated the list. I'll update the rest of the document
+  in a separate round of fixes.
+<<<---
+
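+   For illustration, this is how a logged W-request later reappears on the
+   wire: a DTM0 log record keeps the original request payload together with
+   its transaction descriptor, and both are copied into a REDO message.
+   The snippet below is condensed from dtm0_restore() in this file and is
+   purely illustrative:
+
+   @verbatim
+   struct m0_dtm0_log_rec record;   /* a record taken from the DTM0 log */
+   struct dtm0_req_fop    redo;
+
+   redo = (struct dtm0_req_fop) {
+           .dtr_msg     = DTM_REDO,
+           .dtr_payload = record.dlr_payload, /* the W-request itself,
+                                                 e.g. a CAS PUT */
+           .dtr_txr     = record.dlr_txd,     /* its tx descriptor */
+   };
+   @endverbatim
+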
+ @section DTM0BR-req Requirements + Mandatory. + The DLD shall state the requirements that it attempts to meet. + + Recovery machine shall meet the requirements defined in the DTM0 HLD[1]. +--->>> +Max: [* question *] All of them? If no, please put a list here. If so, please + put the list anyway. +<<<--- + Additionally, it has the following list of low-level requirements: +--->>> +Max: [* question *] Why do we have those requirements? Please provide a way to + track them to the top-level requirements or requirements for other DTM0 + subcomponents. +<<<--- + + - @b R.DTM0BR.HA-Aligned State transitions of the recovery SM shall be + aligned with the state transitions of Motr processes delivered by + the HA subsystem to the recovery machine. + - @b R.DTM0BR.Basic Recovery machine shall support only a subset of + possible use-cases. The subset is defined in @ref DTM0BR-usecases. +--->>> +Max: [* defect *] Usecases are not defined. +<<<--- + - @b R.DTM0BR.No-Batching Recovery machine shall replay logs in + a synchronous manner one-by-one. +--->>> +Max: [* question *] One-by-one log or one-by-one log record? In the latter case: + what about records from logs on different nodes? +<<<--- +--->>> +Max: [* question *] How those logs or records are ordered (to be able to process + them one-by-one)? +<<<--- +--->>> +Max: [* question *] What about performance: is it possible for this design to + meet performance requirements? In any case please clearly state if it is + and why. +<<<--- + +
+   @section DTM0BR-depends Dependencies
+   Mandatory. Identify other components on which this specification
+   depends.
+
+   The basic recovery machine depends on the following components:
+   - Hare. This component is supposed to own an ordered log of all HA events
+   in the system. Note, this dependency is optional for a certain set
+   of use-cases where events are delivered in the right way even without
+   being backed up by a log.
+--->>>
+Max: [* defect *] This is not true at the moment. Please specify if it's
+  possible to implement this design without this being true or what is the
+  plan to deal with this.
+IvanA: [fixed] I added a note that this thing is optional for some cases.
+<<<---
+   - DTM0 service. The service provides access to DTM0 log and DTM0 RPC link.
+   - DTM0 log. The log is used to fill in REDO messages.
+   - DTM0 RPC link. The link is used as transport for REDO messages.
+   - Motr HA/conf. HA states of Motr conf objects provide the information
+   about state transitions of local and remote Motr processes.
+   - Motr conf. The conf module provides the list of remote DTM0 services.
+
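+   The machine reaches these dependencies through an ops vtable rather than
+   calling them directly, which also lets unit tests substitute mocks. The
+   sketch below is reconstructed from the recovery_machine_*() wrappers
+   further down in this file; the authoritative declaration lives in
+   dtm0/recovery.h:
+
+   @verbatim
+   struct m0_dtm0_recovery_machine_ops {
+           /* Attach an iterator to the local DTM0 log. */
+           int  (*log_iter_init)(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter      *iter);
+           /* Fetch the next record to be replayed to tgt_svc. */
+           int  (*log_iter_next)(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter      *iter,
+                                 const struct m0_fid             *tgt_svc,
+                                 const struct m0_fid             *origin_svc,
+                                 struct m0_dtm0_log_rec          *record);
+           void (*log_iter_fini)(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter      *iter);
+           /* Send a REDO message over the DTM0 RPC link. */
+           void (*redo_post)(struct m0_dtm0_recovery_machine *m,
+                             struct m0_fom                   *fom,
+                             const struct m0_fid             *tgt_proc,
+                             const struct m0_fid             *tgt_svc,
+                             struct dtm0_req_fop             *redo,
+                             struct m0_be_op                 *op);
+           /* Post a process event (e.g., DTM_RECOVERED) to HA. */
+           void (*ha_event_post)(struct m0_dtm0_recovery_machine *m,
+                                 const struct m0_fid             *tgt_proc,
+                                 const struct m0_fid             *tgt_svc,
+                                 enum m0_conf_ha_process_event    event);
+   };
+   @endverbatim
+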
+   @section DTM0BR-highlights Design Highlights
+   Mandatory. This section briefly summarizes the key design
+   decisions that are important for understanding the functional and
+   logical specifications, and enumerates topics that need special
+   attention.
+
+   Each DTM0 service has a recovery machine. Each recovery machine contains
+   N recovery FOMs (one per counterpart + one local). Recovery FOMs start
+   listening to various events (HA events, DTM0 log events, RPC replies) and
+   act accordingly (sending out M0_CONF_HA_PROCESS_* events and REDO messages).
+
+   @verbatim
+
+   +-------------------------------------------+
+   | Recoverable process, process fid = 1      |
+   | +---------------------------------------+ |
+   | | DTM0 Service                          | |
+   | |                                       | |
+   | | +-----------------------------------+ | |
+   | | | Recovery machine                  | | |
+   | | |                                   | | |
+   | | | +--------+  +--------+ +--------+ | | |
+   | | | | R_FOM1 |  | R_FOM2 | | R_FOM3 | | | |
+   | | | | FID=1  |  | FID=2  | | FID=3  | | | |
+   | | | +--------+  +--------+ +--------+ | | |
+   | | |                                   | | |
+   | | +-----------------------------------+ | |
+   | +---------------------------------------+ |
+   +-------------------------------------------+
+
+   Recovery machine of one of the processes of a 3-process cluster.
+   The other two processes have the same set of FOMs. FID=N means that
+   the id of this recovery FOM is N. The first one (R_FOM1) is the local
+   recovery FOM of this DTM0 service. The other two (R_FOM2, R_FOM3) are
+   remote recovery FOMs that would send REDOs to the processes 2 and 3.
+
+   @endverbatim
+
+   Each recovery FOM may await on external events:
+
+   @verbatim
+
+   +-----------------------------------------------------+
+   | Recovery FOM inputs                                 |
+   |                                                     |
+   |    Polling                                          |
+   |   ----------------------------------------------    |
+   |   | HA queue |  | DTM0 log |    | RPC replies  |    |
+   |   |          |  | watcher  |    |              |    |
+   |   |  /|\     |  |          |    |     /|\      |    |
+   |   |   |      |  |   /|\    |    |      |       |    |
+   |   |   |      |  |    |     |    |      |       |    |
+   |   | conf obj |  | DTM0 log |    | DTM0 RPC link|    |
+   +-----------------------------------------------------+
+
+   @endverbatim
+
+   For example, when a recovery FOM performs recovery of a remote
+   process, it may await on the HA queue (to halt sending of REDOs), on
+   the DTM0 log watcher (to learn if there are new entries in the log),
+   or on the DTM0 RPC link (to learn if the next REDO message can be sent).
+
+   Whenever a recovery FOM receives an event from the HA queue, it gets
+   "reincarnated" to serve its particular role in recovery procedures
+   (see the "Recovery FOM reincarnation" definition).
+   Incarnation of a recovery FOM is just a sub-state-machine of the FOM.
+   For example, here is the full state machine of R_FOM2:
+
+   @verbatim
+
+   INIT -> AWAIT_HEQ (HA event queue)
+
+   AWAIT_HEQ -> DECIDE_WHAT_TO_DO
+   DECIDE_WHAT_TO_DO -> AWAIT_HEQ
+   DECIDE_WHAT_TO_DO -> RESET
+   DECIDE_WHAT_TO_DO -> FINAL
+
+   RESET -> SEND_REDO
+
+   SEND_REDO -> AWAIT_HEQ_OR_LOG
+   SEND_REDO -> AWAIT_HEQ_OR_RPC
+
+   AWAIT_HEQ_OR_LOG -> SEND_REDO
+   AWAIT_HEQ_OR_LOG -> HEQ
+
+   AWAIT_HEQ_OR_RPC -> SEND_REDO
+   AWAIT_HEQ_OR_RPC -> HEQ
+
+   INIT --------------> AWAIT_HEQ
+                          |  /|\
+                         \|/  |
+                   DECIDE_WHAT_TO_DO --------------> FINAL
+                          |  /|\
+                          |   |                  (remote recovery
+                          |   |                   sub-states)
+   //=====================|===|===================================//
+   //   RESET <-----------+   +---< HEQ      (Transitions that    //
+   //     |                                   may happen as       //
+   //     |                                   a reaction to       //
+   //     |   +------------------------+      RECOVERING HA event)//
+   //    \|/ \|/                       |                          //
+   //   SEND_REDO ---------------> AWAIT_ON_HEQ_OR_RPC ---> HEQ   //
+   //     |                            /|\                        //
+   //    \|/                            |                         //
+   //   AWAIT_ON_HEQ_OR_LOG -----> HEQ                            //
+   //==============================================================//
+
+       ...                  ...
+        |                    |               (eviction sub-states)
+   //===|====================|====================================//
+   //   RESET <--------------+    +----< HEQ (Transitions that    //
+   //     |                       |           may happen as       //
+   //     |   +-------------------+           a reaction to FAILED//
+   //    \|/ \|/                              HA event)           //
+   //   SEND_REDO -----> AWAIT_ON_HEQ_OR_RPC ----> HEQ            //
+   //==============================================================//
+   @endverbatim
+
+   The first frame highlighted with "===//" shows the sub-state machine of
+   a recovery FOM that is used to recover a participant that entered the
+   RECOVERING HA state. If the same participant enters the FAILED state
+   (permanent failure), then another set of sub-states would be involved
+   in recovery procedures (the second frame).
+
+   On the diagram above, the transition DECIDE_WHAT_TO_DO -> RESET
+   marks the point where the recovery FOM enters a new incarnation, and
+   the transition HEQ -> DECIDE_WHAT_TO_DO marks the end of the life of this
+   incarnation. The set of sub-states is called "role".
+   For example, if a recovery FOM is somewhere inside the first frame
+   (remote recovery) then we say that it has the "remote recovery role" and
+   it performs "remote recovery". Correspondingly, the second frame describes
+   the "eviction role" or just "eviction".
+
+   The notion of incarnation is used to emphasise that a recovery FOM must
+   always "clean up" its volatile state before processing the next HA event.
+   For example, it may await on a pending RPC reply or it may have to reset
+   the DTM0 log iterator.
+
+   The local recovery FOM (R_FOM1 in the example) is just a volatile
+   state that captures information about certain points in the log and sends
+   out ::M0_CONF_HA_PROCESS_RECOVERED when it believes that the local process
+   has been fully recovered (see the recovery stop condition below).
+   The local recovery FOM cannot change its role (it cannot recover itself
+   from transient or permanent failures).
+
+   UPD: The design described above has actually been implemented using
+   coroutines, not an FSM. So we do not have these states in the code. But
+   the coroutines are implemented in the same spirit as outlined above, that
+   is: a coroutine executes all actions needed for a given incarnation,
+   cleans up, and only after this reads the next event from the HEQ (HA
+   event queue).
+
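+   To make this concrete, the skeleton of such a coroutine looks as follows.
+   The snippet is condensed from remote_recovery_fom_coro() further down in
+   this file (frame-data boilerplate elided); it is an illustration, not the
+   verbatim implementation:
+
+   @verbatim
+   while (!eoq) {
+           /* Pick the action that corresponds to the current
+            * incarnation (role) of this recovery FOM. */
+           action = (state == M0_NC_DTM_RECOVERING) ? dtm0_restore
+                                                    : heq_await;
+           /* Run it to completion: the action replays the log and/or
+            * awaits, cleans up, and returns the next HA event (or
+            * end-of-queue) taken from the HEQ. */
+           M0_CO_FUN(CO(fom), action(fom, &state, &eoq));
+   }
+   m0_fom_phase_set(fom, RFS_DONE);
+   @endverbatim
+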
+   @section DTM0BR-lspec Logical Specification
+   Mandatory. This section describes the internal design of the component,
+   explaining how the functional specification is met. Sub-components and
+   diagrams of their interaction should go into this section. The section has
+   mandatory subsections created using the Doxygen @@subsection command. The
+   designer should feel free to use additional sub-sectioning if needed, though
+   if there is significant additional sub-sectioning, provide a table of
+   contents here.
+
+   - @ref DTM0BR-lspec-comps
+   - @ref DTM0BR-lspec-rem-fom
+   - @ref DTM0BR-lspec-loc-fom
+   - @ref DTM0BR-lspec-evc-fom
+   - @ref DTM0BR-lspec-state
+   - @ref DTM0BR-lspec-thread
+   - @ref DTM0BR-lspec-numa
+
+   @subsection DTM0BR-lspec-comps Component Overview
+   Mandatory.
+   This section describes the internal logical decomposition.
+   A diagram of the interaction between internal components and
+   between external consumers and the internal components is useful.
+
+   The following diagram shows the connections between the recovery machine
+   and the other components of the system:
+
+   @dot
+   digraph {
+     rankdir = LR;
+     bgcolor="lightblue"
+     node[shape=box];
+     label = "DTM0 Basic recovery machine components and dependencies";
+     subgraph cluster_recovery_sm {
+       label = "Recovery SM";
+       bgcolor="lightyellow"
+       node[shape=box];
+       rfom[label="Recovery FOM"];
+       E1[shape=none label="⋮" fontsize=30]
+       rfom_another[label="Another Recovery FOM"];
+     }
+
+     subgraph cluster_ha_conf {
+       node[shape=box];
+       label ="HA and Conf";
+       conf_obj[label="Conf object"];
+       ha[label="Motr HA"];
+     }
+
+     drlink[label="DTM0 RPC link"];
+     dlog[label="DTM0 log"];
+
+     conf_obj -> rfom[label="HA decisions"];
+     rfom -> ha [label="Process events"];
+     rfom -> drlink [label="REDO msgs"];
+     dlog -> rfom[label="records"];
+   }
+   @enddot
+--->>>
+Max: [* defect *] Communication with other recovery foms in the same process
+  and in different processes is missing. Please add.
+<<<---
+
+   The following sequence diagram represents an example of linearisation of
+   HA decisions processing:
+   @msc
+   ha,rfom, remote;
+   ha -> rfom [label="RECOVERING"];
+   rfom -> remote [label="REDO(1)"];
+   ha -> rfom [label="TRANSIENT"];
+   remote -> rfom [label="REDO(1).reply(-ETIMEDOUT)"];
+   ha -> rfom [label="RECOVERING"];
+   rfom -> rfom [label="Cancel recovery"];
+   rfom -> remote [label="REDO(1)"];
+   remote -> rfom [label="REDO(1).reply(ok)"];
+   @endmsc
+
+   In this example the recovery machine receives a sequence of events
+   (RECOVERING, TRANSIENT, RECOVERING), and reacts to them as follows:
+   - Starts replaying the log by sending REDO(1).
+   - Awaits on the reply.
+   - Cancels recovery.
+   - Re-starts the replay by sending REDO(1) again.
+   - Gets the reply.
+--->>>
+Max: [* defect *] One example is not enough to describe the algorithm. Please
+  explain how it's done exactly. Maybe not in this section, but somewhere in
+  DLD.
+<<<---
+
+   @subsection DTM0BR-lspec-rem-fom Remote recovery FOM
+   Such sections briefly describe the purpose and design of each
+   sub-component.
+--->>>
+Max: [* defect *] State machine is there, but what the fom actually does is
+  missing. Please add.
+<<<---
+
+   When a recovery FOM detects a state transition to RECOVERING of a remote
+   participant, it transits to the sub-SM called "Remote recovery FOM".
+   In other words, it re-incarnates as "Remote recovery FOM" as soon as
+   the previous incarnation is ready to reach its "terminal" state.
+
+   Remote recovery FOM reacts to the following kinds of events:
+   - Process state transitions (from the HA subsystem);
+   - DTM0 log getting new records;
+   - RPC replies (from the DTM0 RPC link component).
+
+   Remote recovery FOM has the following states and transitions:
+
+   @verbatim
+   REMOTE_RECOVERY(initial) -> WAITING_ON_HA_OR_RPC
+   REMOTE_RECOVERY(initial) -> WAITING_ON_HA_OR_LOG
+   WAITING_ON_HA_OR_RPC -> WAITING_ON_HA_OR_LOG
+   WAITING_ON_HA_OR_LOG -> WAITING_ON_HA_OR_RPC
+   WAITING_ON_HA_OR_RPC -> NEXT_HA_STATE (terminal)
+   WAITING_ON_HA_OR_LOG -> NEXT_HA_STATE (terminal)
+   WAITING_ON_HA_OR_RPC -> SHUTDOWN (terminal)
+   WAITING_ON_HA_OR_LOG -> SHUTDOWN (terminal)
+
+   Where:
+   REMOTE_RECOVERY is a local initial state (local to the sub-SM);
+   NEXT_HA_STATE is a local terminal state;
+   SHUTDOWN is a global terminal state.
+   @endverbatim
+
+   @subsection DTM0BR-lspec-loc-fom Local recovery FOM
+   Such sections briefly describe the purpose and design of each
+   sub-component.
+
+   Local recovery is used to ensure fairness of the overall recovery
+   procedure.
+--->>>
+Max: [* defect *] Definition of "fairness" in this context is missing. Please
+  add.
+<<<---
+--->>>
+Max: [* defect *] Definition of how the "fairness" is "ensured" is missing.
+  Please add.
+<<<---
+   Whenever a participant learns that it has received all the log records it
+   missed, it sends the M0_CONF_HA_PROCESS_RECOVERED process event to the HA
+   subsystem.
+
+   The point where this event has to be sent is called the "recovery stop
+   condition". The local participant (i.e., the one that is in
+   the RECOVERING state) uses the information about new incoming W-requests
+   and about the most recent (txid-wise) log entries on the other
+   participants to decide whether recovery shall be stopped or not.
+
+   TODO: describe the details on when and how it should stop.
+--->>>
+Max: [* defect *] You're right, the details are missing.
+<<<---
+
+
+   @subsection DTM0BR-lspec-evc-fom Eviction FOM
+   Such sections briefly describe the purpose and design of each
+   sub-component.
+--->>>
+Max: [* defect *] The purpose of the eviction fom is not described.
+<<<---
+
+   A recovery FOM re-incarnates as an eviction FOM (i.e., enters the initial
+--->>>
+Max: [* defect *] A definition of "re-incarnation" is missing.
+<<<---
+   state of the corresponding sub-SM) when the HA subsystem notifies about a
+--->>>
+Max: [* defect *] A definition of sub-SM is missing.
+<<<---
+   permanent failure (FAILED) on the corresponding participant.
+
+   The local DTM0 log shall be scanned for any record in which the FAILED
+--->>>
+Max: [* doc *] Please add a reference to the component which does the scanning.
+<<<---
+--->>>
+Max: [* doc *] Please explain how the scanning is done:
+   - sequentially?
+   - from least recent to most recent?
+   - one fom/thread/whatever or multiple? How the work is distributed?
+   - locks also have to be described somewhere in DLD.
+<<<---
+   participant participated. Such a record shall be replayed to the
+   other participants that are capable of receiving REDO messages.
+--->>>
+Max: [* defect *] What to do with the message if a participant is in TRANSIENT
+  state is not described.
+<<<---
+
+   When the log has been replayed completely, the eviction FOM notifies
+--->>>
+Max: [* defect *] Criteria of "log has been replayed completely" is missing.
+  Please add.
+<<<---
+   the HA subsystem about completion and leaves the sub-SM.
+--->>>
+Max: [* question *] How? Please also describe what is expected from HA.
+<<<---
+
+   TODO: GC of FAILED processes and handling of client restarts.
+
+   @subsection DTM0BR-lspec-state State Specification
+   Mandatory.
+   This section describes any formal state models used by the component,
+   whether externally exposed or purely internal.
+
+   The state of a recovery FOM is defined as a collection of the sub-SM
+   states and a few global states.
+
+   TODO: state diagram for the overall recovery FOM.
+--->>>
+Max: [* doc *] Also distributed state diagram.
+<<<---
+
+   @subsection DTM0BR-lspec-thread Threading and Concurrency Model
+   Mandatory.
+   This section describes the threading and concurrency model.
+   It describes the various asynchronous threads of operation, identifies
+   the critical sections and synchronization primitives used
+   (such as semaphores, locks, mutexes and condition variables).
+
+   Recovery machine revolves around the following kinds of threads:
+--->>>
+Max: [* defect *] Definition of "revolves" is missing.
+<<<---
+   - Locality threads (Recovery FOMs, DTM0 log updates).
+   - RPC machine thread (RPC replies).
+
+   Each recovery FOM is implemented using the Motr coroutines library.
+   Within the coroutine ::m0_be_op and/or ::m0_co_op are used to await
+--->>>
+Max: [* question *] RE: "the coroutine": which one?
+<<<---
+   on external events.
+--->>>
+Max: [* suggestion *] A list of such events would help to understand why be/co
+  ops are needed.
+<<<---
+
+   DTM0 log is locked using a mutex (TBD: the log mutex or a separate mutex?)
+--->>>
+Max: [* question *] Entire DTM0 log is locked using a mutex? If no, please add
+  an unambiguous sentence. Example: "<what> is protected with a mutex" or
+  "a separate mutex protects concurrent access to <what>".
+<<<---
+   whenever the recovery machine sets up or cleans up its watcher.
+--->>>
+Max: [* defect *] The watcher description/workflow/etc. is missing. Please add.
+<<<---
+
+   Interaction with the RPC machine is wrapped by the Motr RPC library and
+--->>>
+Max: [* typo *] s/Interation/Interaction/
+<<<---
+   the DTM0 RPC link component. It helps to unify the type of completion
+   objects across the FOM code.
+
+   TODO: describe be/co op poll/select.
+
+   @subsection DTM0BR-lspec-numa NUMA optimizations
+   Mandatory for components with programmatic interfaces.
+   This section describes if optimal behavior can be supported by
+   associating the utilizing thread to a single processor.
+
+   There are not so many shared resources outside of the DTM0 log and
+   the localities. Scaling and batching are outside of the scope
+--->>>
+Max: [* defect *] The number of localities is equal to the number of CPU cores
+  in our default configuration, so whatever uses more than one locality has to
+  have some kind of synchronisation.
+<<<---
+--->>>
+Max: [* defect *] It's not clear what "scaling" and "batching" mean in this
+  context. Please explain and/or add a reference where they are explained.
+<<<---
+   of this document.
+--->>>
+Max: [* defect *] FOM locality assignment is missing.
+<<<---
+
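+   Returning to the threading model: the be-op await pattern mentioned
+   above, condensed from heq_await() further down in this file (purely
+   illustrative; error handling elided):
+
+   @verbatim
+   M0_CO_REENTER(CO(fom),
+                 struct m0_be_op op;
+                 bool            got;
+                 uint64_t        state;);
+
+   m0_be_op_init(&F(op));
+   m0_be_queue_lock(&rf->rf_heq);
+   /* Ask the queue for the next HA event; F(op) becomes "done" when
+    * an item (or the end of the queue) is available. */
+   M0_BE_QUEUE_GET(&rf->rf_heq, &F(op), &F(state), &F(got));
+   m0_be_queue_unlock(&rf->rf_heq);
+   /* Park the FOM in RFS_WAITING until the op completes. */
+   M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING));
+   m0_be_op_fini(&F(op));
+   @endverbatim
+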
+
+   @section DTM0BR-conformance Conformance
+   Mandatory.
+   This section cites each requirement in the @ref DTM0BR-req section,
+   and explains briefly how the DLD meets the requirement.
+--->>>
+Max: [* defect *] Top-level requirements from HLD are missing.
+<<<---
+
+   - @b I.DTM0BR.HA-Aligned Recovery machine provides an event queue that
+   is being consumed in order by the corresponding FOM.
+--->>>
+Max: [* defect *] The word "queue" is present in this document only here.
+  Please describe the event queue and events somewhere in this DLD.
+<<<---
+--->>>
+Max: [* defect *] It's not clear how "HA subsystem" from the requirement is met
+  by the implementation.
+<<<---
+   - @b I.DTM0BR.Basic Recovery machine supports only basic log replay defined
+   by the remote recovery FOM and its counterpart.
+--->>>
+Max: [* defect *] "basic" term is not defined. Please define.
+<<<---
+   - @b I.DTM0BR.No-Batching Recovery machine achieves this by awaiting on
+   the RPC reply to every REDO message sent.
+--->>>
+Max: [* defect *] Maybe I should wait for I.* for performance and availability
+  requirements, but as of now it's not clear how DTM recovery would catch up
+  with the rest of the nodes creating new DTM0 transactions in parallel with
+  DTM recovery.
+<<<---
+
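+   The synchronous REDO/reply loop behind I.DTM0BR.No-Batching, condensed
+   from dtm0_restore() further down in this file (REDO construction and
+   error handling elided; illustrative only):
+
+   @verbatim
+   do {
+           rc = recovery_machine_log_iter_next(rf->rf_m, &rf->rf_log_iter,
+                                               &rf->rf_tgt_svc, NULL,
+                                               &record);
+           next = (rc == 0); /* a non-zero rc means: send EOL and stop */
+           /* Post one REDO, then block until its reply arrives;
+            * no other REDO is in flight in the meantime. */
+           recovery_machine_redo_post(rf->rf_m, &rf->rf_base,
+                                      &rf->rf_tgt_proc, &rf->rf_tgt_svc,
+                                      &redo, &F(reply_op));
+           M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(reply_op),
+                                                     fom, RFS_WAITING));
+           m0_be_op_reset(&F(reply_op));
+   } while (next);
+   @endverbatim
+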
+   @section DTM0BR-usecases Use-cases
+   Mandatory. This section describes use-cases for the recovery machine.
+
+
+   TODO: Add use-cases.
+--->>>
+Max: [* defect *] Right, they are missing.
+<<<---
+
+
+ @section DLD-O Analysis + This section estimates the performance of the component, in terms of + resource (memory, processor, locks, messages, etc.) consumption, + ideally described in big-O notation. +--->>> +Max: [* doc *] Please fill this section with references to the requirements. +<<<--- + +
+ @section DLD-ref References + Mandatory. Provide references to other documents and components that + are cited or used in the design. + In particular a link to the HLD for the DLD should be provided. + + - [1]
+ DTM0 HLD + - [2] + +
+ @section DLD-impl-plan Implementation Plan + Mandatory. Describe the steps that should be taken to implement this + design. + + - Develop pseudocode-based description of recovery activities. + - Identify and add missing concurrency primitives (select/poll for be op). + - Use the pseudocode and new primitives to implement a skeleton of the FOMs. + - Incorporate RECOVERING and TRANSIENT states with their + new meaning as soon as they are available in the upstream code. + - Populate the list of use-cases based on available tools. + - Populate the list of system tests. + - Improve the stop condition based on the use-cases and tests. + - Add implementation of eviction FOM. +--->>> +Max: [* defect *] No mention of tests being done. Please clearly state when and + how the tests are going to be done or where and when it would be possible + to find this information. +<<<--- + */ + + + +#define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_DTM0 +#include "lib/trace.h" +#include "dtm0/recovery.h" /* m0_dtm0_recovery_machine */ +#include "dtm0/service.h" /* m0_dtm0_service */ +#include "dtm0/drlink.h" /* m0_dtm0_req_post */ +#include "dtm0/fop.h" /* dtm0_req_fop */ +#include "be/op.h" /* m0_be_op */ +#include "be/queue.h" /* m0_be_queue */ +#include "conf/diter.h" /* diter */ +#include "conf/helpers.h" /* m0_confc_root_open */ +#include "conf/obj.h" /* m0_conf_obj */ +#include "fop/fom.h" /* m0_fom */ +#include "lib/coroutine.h" /* m0_co_context */ +#include "lib/memory.h" /* M0_ALLOC_PTR */ +#include "reqh/reqh.h" /* m0_reqh2confc */ +#include "rpc/rpc_opcodes.h" /* M0_DTM0_RECOVERY_FOM_OPCODE */ +#include "lib/string.h" /* m0_streq */ +#include "be/dtm0_log.h" /* m0_dtm0_log_rec */ +#include "motr/setup.h" /* m0_cs_reqh_context */ +#include "addb2/identifier.h" /* M0_AVI_FOM_TO_TX */ +#include "dtm0/addb2.h" /* M0_AVI_DORM_SM_STATE */ +#include "motr/client_internal.h" /* struct m0client::m0c_reqh */ + +enum { + /* + * Number of HA event that could be submitted by the HA subsystem + * at once; where: + * "HA event" means state transition of a single DTM0 process; + * "at once" means the maximal duration of a tick of any FOM + * running on the same locality as the recovery FOM that + * handles HA events. + * XXX: The number was chosen randomly. Update the previous sentence + * if you want to change the number. + */ + HEQ_MAX_LEN = 32, + + /* + * Number of HA events and EOL messages that could be submitted + * at once. + * TODO: Modify this comment once we stop putting HA events in this + * queue (with help of be-op-or-set). It shall be EOL-only queue. + */ + EOLQ_MAX_LEN = 100, +}; + +struct recovery_fom { + struct m0_fom rf_base; + + /* Recovery machine instance that owns this FOM. */ + struct m0_dtm0_recovery_machine *rf_m; + + /** Subscription to conf obj HA state. */ + struct m0_clink rf_ha_clink; + + /** HA event queue populated by the clink and consumed by the FOM. */ + struct m0_be_queue rf_heq; + + struct m0_co_context rf_coro; + + /** Linkage for m0_dtm0_recovery_machine::rm_foms */ + struct m0_tlink rf_linkage; + + /** Magic for rfom tlist entry. */ + uint64_t rf_magic; + + struct m0_be_queue rf_eolq; + + struct m0_be_dtm0_log_iter rf_log_iter; + + /* Target DTM0 service FID (id of this FOM within the machine). */ + struct m0_fid rf_tgt_svc; + + struct m0_fid rf_tgt_proc; + + /** Is target DTM0 service the service ::rf_m belongs to? */ + bool rf_is_local; + + /** Is target DTM0 volatile? */ + bool rf_is_volatile; + + + /** The most recent HA state of this remote DTM0 service. 
*/ + enum m0_ha_obj_state rf_last_known_ha_state; + + /** + * The most recent known state of the log on a remote DTM0 service. + * Note, it is impossible to guarantee that this stat is "in-sync" + * with ::rf_last_known_ha_state unless we have HA epochs. + */ + bool rf_last_known_eol; +}; + +enum eolq_item_type { + EIT_EOL, + EIT_HA, + EIT_END, +}; + +struct eolq_item { + enum eolq_item_type ei_type; + struct m0_fid ei_source; + enum m0_ha_obj_state ei_ha_state; +}; + +/* + * A global variable to set off parts of the code that were added + * specifically for the integration (all2all) test script. + * Later on, they need to be removed (once we get a better + * way of testing). + */ +const bool ALL2ALL = true; + +M0_TL_DESCR_DEFINE(rfom, "recovery_fom", + static, struct recovery_fom, rf_linkage, + rf_magic, M0_DTM0_RMACH_MAGIC, M0_DTM0_RMACH_HEAD_MAGIC); +M0_TL_DEFINE(rfom, static, struct recovery_fom); + +static int populate_foms (struct m0_dtm0_recovery_machine *m); +static void unpopulate_foms(struct m0_dtm0_recovery_machine *m); + +static bool recovery_fom_ha_clink_cb(struct m0_clink *clink); + +static void recovery_fom_self_fini(struct m0_fom *fom); +static int recovery_fom_tick(struct m0_fom *fom); + +static void recovery_machine_lock(struct m0_dtm0_recovery_machine *m); +static void recovery_machine_unlock(struct m0_dtm0_recovery_machine *m); +static struct recovery_fom * +recovery_fom_local(struct m0_dtm0_recovery_machine *m); + +static bool ha_event_invariant(uint64_t event) +{ + return event < M0_NC_NR; +} + +static void addb2_relate(const struct m0_sm *left, const struct m0_sm *right) +{ + M0_ADDB2_ADD(M0_AVI_FOM_TO_TX, m0_sm_id_get(left), m0_sm_id_get(right)); +} + + +static struct m0_sm_state_descr recovery_machine_states[] = { + [M0_DRMS_INIT] = { + .sd_name = "M0_DRMS_INIT", + .sd_allowed = M0_BITS(M0_DRMS_STOPPED, M0_DRMS_STARTED), + .sd_flags = M0_SDF_INITIAL, + }, + [M0_DRMS_STARTED] = { + .sd_name = "M0_DRMS_STARTED", + .sd_allowed = M0_BITS(M0_DRMS_STOPPED), + .sd_flags = 0, + }, + [M0_DRMS_STOPPED] = { + .sd_name = "M0_DRMS_STOPPED", + .sd_allowed = 0, + .sd_flags = M0_SDF_TERMINAL, + }, +}; + +static struct m0_sm_trans_descr recovery_machine_trans[] = { + { "started", M0_DRMS_INIT, M0_DRMS_STARTED }, + { "stop-running", M0_DRMS_STARTED, M0_DRMS_STOPPED }, + { "stop-idle", M0_DRMS_INIT, M0_DRMS_STOPPED }, +}; + +struct m0_sm_conf m0_drm_sm_conf = { + .scf_name = "recovery_machine", + .scf_nr_states = ARRAY_SIZE(recovery_machine_states), + .scf_state = recovery_machine_states, + .scf_trans_nr = ARRAY_SIZE(recovery_machine_trans), + .scf_trans = recovery_machine_trans, +}; + +M0_INTERNAL void +m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m, + const struct m0_dtm0_recovery_machine_ops *ops, + struct m0_dtm0_service *svc) +{ + M0_PRE(m != NULL); + M0_ENTRY("m=%p, svc=%p", m, svc); + M0_PRE(m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + + m->rm_svc = svc; + if (ops) + m->rm_ops = ops; + else + m->rm_ops = &m0_dtm0_recovery_machine_default_ops; + rfom_tlist_init(&m->rm_rfoms); + m0_sm_group_init(&m->rm_sm_group); + m0_sm_init(&m->rm_sm, &m0_drm_sm_conf, + M0_DRMS_INIT, &m->rm_sm_group); + m0_sm_addb2_counter_init(&m->rm_sm); + + M0_POST(m->rm_ops != NULL); + M0_LEAVE(); +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m) +{ + M0_ENTRY("m=%p", m); + recovery_machine_lock(m); + M0_ASSERT(M0_IN(m->rm_sm.sm_state, (M0_DRMS_STOPPED, M0_DRMS_INIT))); + if (m->rm_sm.sm_state == M0_DRMS_INIT) + m0_sm_state_set(&m->rm_sm, 
M0_DRMS_STOPPED); + m0_sm_fini(&m->rm_sm); + recovery_machine_unlock(m); + /* + * Question: This uses the sm group lock and immediately finalises it, + * which is, in general, wrong (if there can be other threads waiting on + * the lock, then finalisation is incorrect, if no other threads are + * possible at this point, the lock-unlock are not needed). Please add a + * comment explaining why this is correct. + * + * Answer: There are indeed no other threads waiting on the lock. + * Lock-unlock is needed because m0_sm_fini requires it (otherwise + * asserts fail). The reason there are no other threads below: + * + * See ASSERT above: this function only runs when sm_state is + * M0_DRMS_INIT or M0_DRMS_STOPPED. When M0_DRMS_INIT, there are no + * other threads _yet_. When other threads are created -- state becomes + * M0_DRMS_STARTED (threads are recovery FOMs, see + * m0_dtm0_recovery_machine_start()). When M0_DRMS_STOPPED, there are + * no other threads _already_. We go to M0_DRMS_STOPPED when the last + * FOM is finalized -- see recovery_fom_self_fini(). + */ + m0_sm_group_fini(&m->rm_sm_group); + M0_ASSERT(rfom_tlist_is_empty(&m->rm_rfoms)); + rfom_tlist_fini(&m->rm_rfoms); + M0_LEAVE(); +} + +M0_INTERNAL int +m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m) +{ + struct recovery_fom *rf; + int rc; + + /* TODO: Skip initialisation of recovery foms during mkfs. */ + rc = populate_foms(m); + if (rc < 0) + return M0_RC(rc); + + M0_ASSERT(rfom_tlist_is_empty(&m->rm_rfoms) == + (m->rm_local_rfom == NULL)); + + m0_tl_for(rfom, &m->rm_rfoms, rf) { + m0_fom_queue(&rf->rf_base); + } m0_tlist_endfor; + + if (m->rm_local_rfom != NULL) { + recovery_machine_lock(m); + m0_sm_state_set(&m->rm_sm, M0_DRMS_STARTED); + recovery_machine_unlock(m); + } + + if (ALL2ALL) + M0_LOG(M0_DEBUG, "ALL2ALL_STARTED"); + + return M0_RC(rc); +} + +M0_INTERNAL void +m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m) +{ + int rc; + struct recovery_fom *rf; + + recovery_machine_lock(m); + + m0_tl_for(rfom, &m->rm_rfoms, rf) { + m0_be_queue_lock(&rf->rf_heq); + M0_LOG(M0_DEBUG, "heq_end " FID_F, FID_P(&rf->rf_tgt_svc)); + /* + * The recovery fom will finalize itself when the queue ends. + */ + m0_be_queue_end(&rf->rf_heq); + m0_be_queue_unlock(&rf->rf_heq); + } m0_tlist_endfor; + + /* + * This sm wait will release the sm lock before waiting if needed. + * The same lock is used as the sm lock and machine lock. Please see + * recovery_machine_lock() and m0_dtm0_recovery_machine_init(). + * So any other thread that wants to acquire the recovery machine + * lock will have a chance to continue. 
+ */ + rc = m0_sm_timedwait(&m->rm_sm, + M0_BITS(M0_DRMS_INIT, M0_DRMS_STOPPED), + M0_TIME_NEVER); + M0_ASSERT_INFO(rc == 0, "rc=%d", rc); + recovery_machine_unlock(m); + unpopulate_foms(m); +} + +static struct m0_reqh * +m0_dtm0_recovery_machine_reqh(struct m0_dtm0_recovery_machine *m) +{ + return m->rm_svc->dos_generic.rs_reqh; +} + +static const struct m0_fid * +recovery_machine_local_id(const struct m0_dtm0_recovery_machine *m) +{ + return &m->rm_svc->dos_generic.rs_service_fid; +} + +static int recovery_machine_log_iter_next(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter, + const struct m0_fid *tgt_svc, + const struct m0_fid *origin_svc, + struct m0_dtm0_log_rec *record) +{ + M0_PRE(m->rm_ops->log_iter_next != NULL); + return m->rm_ops->log_iter_next(m, iter, tgt_svc, origin_svc, record); +} + +static int recovery_machine_log_iter_init(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter) +{ + M0_PRE(m->rm_ops->log_iter_init != NULL); + return m->rm_ops->log_iter_init(m, iter); + +} + +static void recovery_machine_log_iter_fini(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter) +{ + M0_PRE(m->rm_ops->log_iter_fini != NULL); + m->rm_ops->log_iter_fini(m, iter); +} + +static void recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, + struct m0_fom *fom, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + struct dtm0_req_fop *redo, + struct m0_be_op *op) +{ + M0_PRE(m->rm_ops->redo_post != NULL); + m->rm_ops->redo_post(m, fom, tgt_proc, tgt_svc, redo, op); +} + +static void recovery_machine_recovered(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc) +{ + M0_PRE(m->rm_ops->ha_event_post != NULL); + m->rm_ops->ha_event_post(m, tgt_proc, tgt_svc, + M0_CONF_HA_PROCESS_DTM_RECOVERED); +} + +static void recovery_machine_lock(struct m0_dtm0_recovery_machine *m) +{ + m0_sm_group_lock(&m->rm_sm_group); +} + +static void recovery_machine_unlock(struct m0_dtm0_recovery_machine *m) +{ + m0_sm_group_unlock(&m->rm_sm_group); +} + +enum recovery_fom_state { + RFS_INIT = M0_FOM_PHASE_INIT, + RFS_DONE = M0_FOM_PHASE_FINISH, + RFS_WAITING, + RFS_FAILED, + RFS_NR, +}; + +static struct m0_sm_state_descr recovery_fom_states[] = { + [RFS_INIT] = { + .sd_name = "RFS_INIT", + .sd_allowed = M0_BITS(RFS_WAITING, RFS_DONE, RFS_FAILED), + .sd_flags = M0_SDF_INITIAL, + }, + /* terminal states */ + [RFS_DONE] = { + .sd_name = "RFS_DONE", + .sd_allowed = 0, + .sd_flags = M0_SDF_TERMINAL, + }, + /* failure states */ + [RFS_FAILED] = { + .sd_name = "RFS_FAILED", + .sd_allowed = M0_BITS(RFS_DONE), + .sd_flags = M0_SDF_FAILURE, + }, + + /* intermediate states */ + [RFS_WAITING] = { + .sd_name = "RFS_WAITING", + .sd_allowed = M0_BITS(RFS_DONE, + RFS_FAILED, RFS_WAITING), + }, +}; + +const static struct m0_sm_conf recovery_fom_conf = { + .scf_name = "recovery_fom", + .scf_nr_states = ARRAY_SIZE(recovery_fom_states), + .scf_state = recovery_fom_states, +}; + +static size_t recovery_fom_locality(const struct m0_fom *fom) +{ + return 1; +} + +static const struct m0_fom_ops recovery_fom_ops = { + .fo_fini = recovery_fom_self_fini, + .fo_tick = recovery_fom_tick, + .fo_home_locality = recovery_fom_locality +}; + +static struct m0_fom_type recovery_fom_type; +static const struct m0_fom_type_ops recovery_fom_type_ops = {}; + +M0_INTERNAL int m0_drm_domain_init(void) +{ + int rc = 0; + + M0_PRE(!m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + + m0_fom_type_init(&recovery_fom_type, + 
M0_DTM0_RECOVERY_FOM_OPCODE, + &recovery_fom_type_ops, + &dtm0_service_type, + &recovery_fom_conf); + + m0_sm_conf_init(&m0_drm_sm_conf); + rc = m0_sm_addb2_init(&m0_drm_sm_conf, + M0_AVI_DRM_SM_STATE, + M0_AVI_DRM_SM_COUNTER); + + M0_POST(m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + + return M0_RC(rc); +} + +M0_INTERNAL void m0_drm_domain_fini(void) +{ + if (m0_sm_conf_is_initialized(&m0_drm_sm_conf)) { + m0_sm_addb2_fini(&m0_drm_sm_conf); + m0_sm_conf_fini(&m0_drm_sm_conf); + M0_POST(!m0_sm_conf_is_initialized(&m0_drm_sm_conf)); + } +} + +static void eolq_post(struct m0_dtm0_recovery_machine *m, + struct eolq_item *item) +{ + struct recovery_fom *rf = recovery_fom_local(m); + m0_be_queue_lock(&rf->rf_eolq); + /* + * Do not post any events if we have already recovered this DTM0 + * service (eoq is an indicator here). + */ + if (!rf->rf_eolq.bq_the_end) { + /* + * Assumption: the queue never gets full. + * XXX: We could panic in this case. Right now, the HA/REDO FOM + * should get stuck in the tick. Probably, panic is + * a better solution here. + */ + M0_BE_OP_SYNC(op, M0_BE_QUEUE_PUT(&rf->rf_eolq, &op, item)); + } + m0_be_queue_unlock(&rf->rf_eolq); +} + +static void heq_post(struct recovery_fom *rf, enum m0_ha_obj_state state) +{ + uint64_t event = state; + m0_be_queue_lock(&rf->rf_heq); + /* + * Assumption: the queue never gets full. + * XXX: We could panic in this case. Right now, the HA FOM should get + * stuck in the tick. Probably, panic is a better solution + * here. + */ + M0_BE_OP_SYNC(op, M0_BE_QUEUE_PUT(&rf->rf_heq, &op, &event)); + m0_be_queue_unlock(&rf->rf_heq); + + M0_LOG(M0_DEBUG, "heq_enq " FID_F " %s ", + FID_P(&rf->rf_tgt_svc), + m0_ha_state2str(state)); + + /* + * Recovery machine uses two kinds of queues: HEQ and EOLQ. HEQ is HA + * Events Queue, every recovery FOM has its own instance; it is used to + * track HA events of a specific participant (tied to this given rfom). + * EOLQ is EOL Queue, queue to track end-of-log events during recovery. + * When we receive EOL from all participants -- we know recovery is + * completed. There is a nuance though. If some remote participant + * goes TRANSIENT/FAILED during recovery, we do not need to wait for EOL + * from that side. The easiest way to deliver this information to local + * recovery FOM is to add an entry in EOLQ (since local RFOM already + * 'listens' on that queue). So that's what happens below. heq_post() + * is called when there is an HA event on some participant. We add the + * HA event itself in HEQ above, and then add it to EOLQ below. Since + * local recovery FOM is not expecting EOL from itself, we only do + * eolq_post() if HA event does not come from local participant. 
+ */ + if (!rf->rf_is_local) + eolq_post(rf->rf_m, + &(struct eolq_item) { + .ei_type = EIT_HA, + .ei_ha_state = state, + .ei_source = rf->rf_tgt_svc, }); +} + +static bool recovery_fom_ha_clink_cb(struct m0_clink *clink) +{ + struct recovery_fom *rf = M0_AMB(rf, clink, rf_ha_clink); + struct m0_conf_obj *proc_conf = container_of(clink->cl_chan, + struct m0_conf_obj, + co_ha_chan); + heq_post(rf, proc_conf->co_ha_state); + return false; +} + +static int recovery_fom_init(struct recovery_fom *rf, + struct m0_dtm0_recovery_machine *m, + struct m0_conf_process *proc_conf, + const struct m0_fid *target, + bool is_volatile) +{ + int rc; + bool is_local = m0_fid_eq(recovery_machine_local_id(m), target); + + M0_ENTRY("m=%p, rf=%p, tgt=" FID_F ", is_vol=%d, is_local=%d", + m, rf, FID_P(target), !!is_volatile, !!is_local); + + M0_PRE(ergo(is_local, m->rm_local_rfom == NULL)); + + rc = m0_be_queue_init(&rf->rf_heq, &(struct m0_be_queue_cfg){ + .bqc_q_size_max = HEQ_MAX_LEN, + /* + * Two producers: + * 1. Conf-obj HA state updates (heq_post()). + * 2. Stop-and-wait-when-finalising + * (m0_dtm0_recovery_machine_stop()). + */ + .bqc_producers_nr_max = 2, + /* + * Single consumer - recovery machine FOM (recovery_fom_coro()). + */ + .bqc_consumers_nr_max = 1, + .bqc_item_length = sizeof(uint64_t), + }); + if (rc != 0) + return M0_ERR(rc); + + if (is_local) { + rc = m0_be_queue_init(&rf->rf_eolq, &(struct m0_be_queue_cfg){ + .bqc_q_size_max = EOLQ_MAX_LEN, + /* + * Consumers and producers are the same as for + * rf_heq above. + */ + .bqc_producers_nr_max = 2, + .bqc_consumers_nr_max = 1, + .bqc_item_length = sizeof(struct eolq_item), + }); + if (rc != 0) { + m0_be_queue_fini(&rf->rf_heq); + return M0_ERR(rc); + } + M0_ASSERT(!rf->rf_eolq.bq_the_end); + m->rm_local_rfom = rf; + + /* + * This is local recovery FOM. We don't need to wait for EOL + * from ourselves. + */ + rf->rf_last_known_eol = true; + /* + * Local recovery FOM is responsible for accepting REDO + * messages. We're starting up, we know for sure that we will + * go through RECOVERING phase, so we are defaulting to + * M0_NC_DTM_RECOVERING. + */ + rf->rf_last_known_ha_state = M0_NC_DTM_RECOVERING; + } + + rf->rf_m = m; + rf->rf_tgt_svc = *target; + rf->rf_tgt_proc = proc_conf->pc_obj.co_id; + rf->rf_is_local = is_local; + rf->rf_is_volatile = is_volatile; + + rfom_tlink_init(rf); + m0_co_context_init(&rf->rf_coro); + + m0_clink_init(&rf->rf_ha_clink, recovery_fom_ha_clink_cb); + m0_clink_add_lock(&proc_conf->pc_obj.co_ha_chan, &rf->rf_ha_clink); + + M0_LOG(M0_DEBUG, "Subscribed to " FID_F " with initial state: %s", + FID_P(target), m0_ha_state2str(proc_conf->pc_obj.co_ha_state)); + + m0_fom_init(&rf->rf_base, &recovery_fom_type, + &recovery_fom_ops, NULL, NULL, + m0_dtm0_recovery_machine_reqh(m)); + + rfom_tlist_add_tail(&m->rm_rfoms, rf); + + return M0_RC(rc); +} + +/* + * Mark the queue as ended and drain it until the end. 
+ */ +static void m0_be_queue__finish(struct m0_be_queue *bq, struct m0_buf *item) +{ + bool got = true; + + m0_be_queue_lock(bq); + if (!bq->bq_the_end) { + m0_be_queue_end(bq); + while (got) + M0_BE_OP_SYNC(op, m0_be_queue_get(bq, &op, item, &got)); + } + M0_POST(bq->bq_the_end); + m0_be_queue_unlock(bq); +} +#define M0_BE_QUEUE__FINISH(bq, item_type) ({ \ + item_type item; \ + m0_be_queue__finish(bq, &M0_BUF_INIT_PTR(&item)); \ +}) + +static void recovery_fom_fini(struct recovery_fom *rf) +{ + M0_ENTRY("m=%p, rf= %p", rf->rf_m, rf); + m0_clink_del_lock(&rf->rf_ha_clink); + m0_clink_fini(&rf->rf_ha_clink); + m0_co_context_fini(&rf->rf_coro); + rfom_tlink_fini(rf); + if (rf->rf_is_local) { + rf->rf_m->rm_local_rfom = NULL; + m0_be_queue_fini(&rf->rf_eolq); + } + m0_be_queue_fini(&rf->rf_heq); + M0_LEAVE(); +} + +static void recovery_fom_self_fini(struct m0_fom *fom) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + struct m0_dtm0_recovery_machine *m = rf->rf_m; + bool is_stopped; + + M0_ENTRY("fom=%p, m=%p, rf=%p", fom, m, rf); + recovery_machine_lock(m); + rfom_tlist_remove(rf); + is_stopped = rfom_tlist_is_empty(&m->rm_rfoms); + recovery_fom_fini(rf); + m0_fom_fini(fom); + m0_free(rf); + if (is_stopped) + m0_sm_move(&m->rm_sm, 0, M0_DRMS_STOPPED); + recovery_machine_unlock(m); + M0_LEAVE("is_stopped=%d", !!is_stopped); +} + +static int recovery_fom_add(struct m0_dtm0_recovery_machine *m, + struct m0_conf_process *proc_conf, + const struct m0_fid *target, + bool is_volatile) +{ + struct recovery_fom *rf; + int rc; + + M0_ALLOC_PTR(rf); + if (rf != NULL) { + recovery_machine_lock(m); + rc = recovery_fom_init(rf, m, proc_conf, target, is_volatile); + recovery_machine_unlock(m); + } else + rc = M0_ERR(-ENOMEM); + + return M0_RC(rc); +} + +static struct recovery_fom * +recovery_fom_by_svc_find(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_svc) +{ + return m0_tl_find(rfom, rf, &m->rm_rfoms, + m0_fid_eq(tgt_svc, &rf->rf_tgt_svc)); +} + +static struct recovery_fom * +recovery_fom_by_svc_find_lock(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_svc) +{ + struct recovery_fom *rf; + recovery_machine_lock(m); + rf = recovery_fom_by_svc_find(m, tgt_svc); + recovery_machine_unlock(m); + return rf; +} + +static struct recovery_fom * +recovery_fom_local(struct m0_dtm0_recovery_machine *m) +{ + M0_PRE(m != NULL); + return m->rm_local_rfom; +} + +static void unpopulate_foms(struct m0_dtm0_recovery_machine *m) +{ + struct recovery_fom *rf; + + M0_ENTRY("m=%p", m); + + recovery_machine_lock(m); + /* + * TODO: assert that list is empty instead of this teardown + */ + m0_tl_teardown(rfom, &m->rm_rfoms, rf) { + recovery_fom_fini(rf); + m0_free(rf); + } + recovery_machine_unlock(m); + + M0_LEAVE(); +} + +static bool conf_obj_is_process(const struct m0_conf_obj *obj) +{ + return m0_conf_obj_type(obj) == &M0_CONF_PROCESS_TYPE; +} + +static bool is_svc_volatile(const struct m0_confc *confc, + const struct m0_fid *svc_fid) +{ + struct m0_conf_service *svc; + struct m0_conf_obj *obj; + const char **param; + + if (M0_FI_ENABLED("always_false")) + return false; + + obj = m0_conf_cache_lookup(&confc->cc_cache, svc_fid); + M0_ASSERT(obj != NULL); + + svc = M0_CONF_CAST(obj, m0_conf_service); + + M0_ASSERT(svc->cs_params != NULL); + + for (param = svc->cs_params; *param != NULL; ++param) { + if (m0_streq(*param, "origin:in-volatile")) + return true; + else if (m0_streq(*param, "origin:in-persistent")) + return false; + } + + M0_IMPOSSIBLE("Service origin is not defined in the 
config?"); +} + +static int populate_foms(struct m0_dtm0_recovery_machine *m) +{ + struct m0_reqh_service *service = &m->rm_svc->dos_generic; + struct m0_confc *confc = m0_reqh2confc(service->rs_reqh); + struct m0_conf_obj *obj; + struct m0_conf_root *root; + struct m0_conf_diter it; + struct m0_conf_process *proc_conf; + struct m0_fid svc_fid; + int rc; + struct recovery_fom *rf; + + M0_ENTRY("recovery machine=%p", m); + + /** UT workaround */ + if (!m0_confc_is_inited(confc)) { + M0_LOG(M0_WARN, "confc is not initiated!"); + return M0_RC(0); + } + + rc = m0_confc_root_open(confc, &root) ?: + m0_conf_diter_init(&it, confc, + &root->rt_obj, + M0_CONF_ROOT_NODES_FID, + M0_CONF_NODE_PROCESSES_FID); + if (rc != 0) + goto out; + + while ((rc = m0_conf_diter_next_sync(&it, conf_obj_is_process)) > 0) { + obj = m0_conf_diter_result(&it); + proc_conf = M0_CONF_CAST(obj, m0_conf_process); + rc = m0_conf_process2service_get(confc, + &proc_conf->pc_obj.co_id, + M0_CST_DTM0, &svc_fid); + if (rc != 0) + continue; + + rc = recovery_fom_add(m, proc_conf, &svc_fid, + is_svc_volatile(confc, &svc_fid)); + if (rc != 0) + break; + } + + m0_conf_diter_fini(&it); + + /* + * It is a workaround to propagate the current HA state. + */ + m0_confc_close(&root->rt_obj); + rc = m0_confc_root_open(confc, &root) ?: + m0_conf_diter_init(&it, confc, + &root->rt_obj, + M0_CONF_ROOT_NODES_FID, + M0_CONF_NODE_PROCESSES_FID); + if (rc != 0) + goto out; + + while ((rc = m0_conf_diter_next_sync(&it, conf_obj_is_process)) > 0) { + obj = m0_conf_diter_result(&it); + proc_conf = M0_CONF_CAST(obj, m0_conf_process); + rc = m0_conf_process2service_get(confc, + &proc_conf->pc_obj.co_id, + M0_CST_DTM0, &svc_fid); + if (rc != 0) + continue; + + rf = recovery_fom_by_svc_find(m, &svc_fid); + if (rf == NULL) + break; + + if (!rf->rf_is_local) + heq_post(rf, proc_conf->pc_obj.co_ha_state); + } + + m0_conf_diter_fini(&it); + /* end of workaround */ + +out: + if (root != NULL) + m0_confc_close(&root->rt_obj); + if (rc != 0) + unpopulate_foms(m); + return M0_RC(rc); +} + +static struct m0_co_context *CO(struct m0_fom *fom) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + return &rf->rf_coro; +} + +#define F M0_CO_FRAME_DATA + +static void heq_await(struct m0_fom *fom, enum m0_ha_obj_state *out, bool *eoq) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + + M0_CO_REENTER(CO(fom), + struct m0_be_op op; + bool got; + uint64_t state;); + + F(got) = false; + F(state) = 0; + M0_SET0(&F(op)); + m0_be_op_init(&F(op)); + m0_be_queue_lock(&rf->rf_heq); + M0_BE_QUEUE_GET(&rf->rf_heq, &F(op), &F(state), &F(got)); + m0_be_queue_unlock(&rf->rf_heq); + M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING)); + m0_be_op_fini(&F(op)); + + + if (F(got)) { + M0_LOG(M0_DEBUG, "heq_deq " FID_F " %s" , + FID_P(&rf->rf_tgt_svc), + m0_ha_state2str(F(state))); + M0_ASSERT(ha_event_invariant(F(state))); + *out = F(state); + } else { + M0_LOG(M0_DEBUG, "heq_deq " FID_F, FID_P(&rf->rf_tgt_svc)); + *eoq = true; + } +} + +static void eolq_await(struct m0_fom *fom, struct eolq_item *out) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + + M0_CO_REENTER(CO(fom), + struct m0_be_op op; + bool got; + struct eolq_item item;); + + F(got) = false; + M0_SET0(&F(item)); + M0_SET0(&F(op)); + m0_be_op_init(&F(op)); + m0_be_queue_lock(&rf->rf_eolq); + M0_BE_QUEUE_GET(&rf->rf_eolq, &F(op), &F(item), &F(got)); + m0_be_queue_unlock(&rf->rf_eolq); + M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(op), fom, RFS_WAITING)); + m0_be_op_fini(&F(op)); + + 
+        *out = F(got) ? F(item) : (struct eolq_item) { .ei_type = EIT_END };
+}
+
+/**
+ * Restore missing transactions on a remote participant.
+ *
+ * Implements part of the recovery process: a healthy ONLINE participant
+ * iterates through the local DTM0 log and sends all needed REDOs to a
+ * remote peer.
+ */
+static void dtm0_restore(struct m0_fom *fom,
+                         enum m0_ha_obj_state *out,
+                         bool *eoq)
+{
+        struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+        struct m0_dtm0_log_rec record;
+        struct dtm0_req_fop redo;
+        int rc = 0;
+
+        M0_CO_REENTER(CO(fom),
+                      struct m0_fid initiator;
+                      struct m0_be_op reply_op;
+                      bool next;
+                      );
+
+        recovery_machine_log_iter_init(rf->rf_m, &rf->rf_log_iter);
+
+        /* XXX: race condition in the case where we are stopping the FOM. */
+        F(initiator) = recovery_fom_local(rf->rf_m)->rf_tgt_svc;
+
+        M0_SET0(&F(reply_op));
+        m0_be_op_init(&F(reply_op));
+
+        do {
+                M0_SET0(&record);
+                rc = recovery_machine_log_iter_next(rf->rf_m, &rf->rf_log_iter,
+                                                    &rf->rf_tgt_svc, NULL,
+                                                    &record);
+                /* Any value except zero means that we should stop recovery. */
+                F(next) = rc == 0;
+                redo = (struct dtm0_req_fop) {
+                        .dtr_msg = DTM_REDO,
+                        .dtr_initiator = F(initiator),
+                        .dtr_payload = record.dlr_payload,
+                        .dtr_txr = record.dlr_txd,
+                        .dtr_flags = F(next) ? 0 : M0_BITS(M0_DMF_EOL),
+                };
+
+                /*
+                 * TODO: there is an extra memcpy happening here -- the first
+                 * copy is done in recovery_machine_log_iter_next() (it clones
+                 * the record), the second copy is done in
+                 * recovery_machine_redo_post() below. If this proves to be
+                 * too inefficient, we can eliminate the extra copies.
+                 */
+                recovery_machine_redo_post(rf->rf_m, &rf->rf_base,
+                                           &rf->rf_tgt_proc, &rf->rf_tgt_svc,
+                                           &redo, &F(reply_op));
+
+                M0_LOG(M0_DEBUG, "out-redo: (m=%p) " REDO_F,
+                       rf->rf_m, REDO_P(&redo));
+                M0_CO_YIELD_RC(CO(fom), m0_be_op_tick_ret(&F(reply_op),
+                                                          fom, RFS_WAITING));
+                m0_be_op_reset(&F(reply_op));
+        } while (F(next));
+
+        m0_be_op_fini(&F(reply_op));
+
+        recovery_machine_log_iter_fini(rf->rf_m, &rf->rf_log_iter);
+        M0_SET0(&rf->rf_log_iter);
+
+        M0_CO_FUN(CO(fom), heq_await(fom, out, eoq));
+}
+
+static void remote_recovery_fom_coro(struct m0_fom *fom)
+{
+        struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+        M0_CO_REENTER(CO(fom),
+                      struct m0_be_op op;
+                      bool eoq;
+                      enum m0_ha_obj_state state;
+                      void (*action)(struct m0_fom *fom,
+                                     enum m0_ha_obj_state *out, bool *eoq);
+                      );
+
+        F(eoq) = false;
+        F(state) = M0_NC_UNKNOWN;
+
+        while (!F(eoq)) {
+                M0_LOG(M0_DEBUG, "remote recovery fom=%p, svc_fid=" FID_F ", "
+                       "proc_fid=" FID_F " handles %s state.", fom,
+                       FID_P(&rf->rf_tgt_svc), FID_P(&rf->rf_tgt_proc),
+                       m0_ha_state2str(F(state)));
+
+                switch (F(state)) {
+                case M0_NC_DTM_RECOVERING:
+                        F(action) = rf->rf_is_volatile ? heq_await :
+                                                         dtm0_restore;
+                        break;
+                case M0_NC_FAILED:
+                        if (ALL2ALL)
+                                M0_LOG(M0_WARN, "Eviction is not supported.");
+                        else
+                                M0_IMPOSSIBLE("Eviction is not supported.");
+                        /*
+                         * Fall-through is intentional here. Eviction code
+                         * will be added here in the future.
+                         */
+                default:
+                        F(action) = heq_await;
+                        break;
+                }
+
+                M0_CO_FUN(CO(fom), F(action)(fom, &F(state), &F(eoq)));
+        }
+
+        m0_fom_phase_set(fom, RFS_DONE);
+}
+
+static bool was_log_replayed(struct recovery_fom *rf)
+{
+        bool outcome;
+        /*
+         * XXX: Clients are ignored by the recovery stop condition.
+         * Once HA and Motr are able to properly handle client
+         * restart, they can be brought back into the condition.
+         */
+        bool is_na = rf->rf_is_volatile;
+
+        /*
+         * Some unit tests need to temporarily avoid the "ignore" described
+         * above. 
In particular, we have a test where client is sending EOL to + * server, and we want it to be received and handled (e.g. + * remach-reboot-server). + */ + if (m0_dtm0_is_expecting_redo_from_client()) { + M0_LOG(M0_DEBUG, "Expect EOL from client"); + is_na = false; + } + + outcome = ergo(!rf->rf_is_local && !is_na, + M0_IN(rf->rf_last_known_ha_state, + (M0_NC_ONLINE, M0_NC_DTM_RECOVERING)) && + rf->rf_last_known_eol); + return outcome; +} + +static void rec_cond_trace(struct m0_dtm0_recovery_machine *m) +{ + struct recovery_fom *rf; + + if (!m0_tl_exists(rfom, rf, &m->rm_rfoms, !rf->rf_is_local)) + M0_LOG(M0_WARN, "Recovery cannot be completed because there " + "are no remote DTM0 services."); + + m0_tl_for(rfom, &m->rm_rfoms, rf) { + M0_LOG(M0_DEBUG, + "id=" FID_F ", is_volatile=%d, " + "is_local=%d, state=%s, got_eol=%d => %d", + FID_P(&rf->rf_tgt_svc), + (int) rf->rf_is_volatile, + (int) rf->rf_is_local, + m0_ha_state2str(rf->rf_last_known_ha_state), + (int) rf->rf_last_known_eol, + (int) was_log_replayed(rf)); + } m0_tl_endfor; +} + +static bool is_local_recovery_completed(struct m0_dtm0_recovery_machine *m) +{ + M0_ENTRY(); + rec_cond_trace(m); + return M0_RC(m0_tl_exists(rfom, r, &m->rm_rfoms, !r->rf_is_local) && + m0_tl_forall(rfom, r, &m->rm_rfoms, was_log_replayed(r))); +} + +static void remote_state_update(struct recovery_fom *rf, + const struct eolq_item *item) +{ + M0_ENTRY("rf=%p, " FID_F " state: %s, eol: %d", + rf, FID_P(&rf->rf_tgt_svc), + m0_ha_state2str(rf->rf_last_known_ha_state), + (int) rf->rf_last_known_eol); + + switch (item->ei_type) { + case EIT_HA: + M0_LOG(M0_DEBUG, "new_state=%s", + m0_ha_state2str(item->ei_ha_state)); + rf->rf_last_known_ha_state = item->ei_ha_state; + /* Clear the EOL flag if the remote is dead. */ + if (M0_IN(item->ei_ha_state, (M0_NC_TRANSIENT, M0_NC_FAILED))) + rf->rf_last_known_eol = false; + break; + case EIT_EOL: + M0_LOG(M0_DEBUG, "new_eol=1"); + rf->rf_last_known_eol = true; + break; + default: + M0_IMPOSSIBLE("Wrong eolq item type %d?", item->ei_type); + break; + } + + M0_LEAVE(); +} + +static void local_recovery_fom_coro(struct m0_fom *fom) +{ + struct recovery_fom *rf = M0_AMB(rf, fom, rf_base); + struct recovery_fom *remote_rf; + + M0_CO_REENTER(CO(fom), + struct m0_be_op eolq_op; + bool eoq; + bool recovered; + struct eolq_item item; + enum m0_ha_obj_state state;); + + F(eoq) = false; + /* + * A DTM0 service without persistent storage does not need + * REDOs. + * mkfs does not require DTM0 support as well. + */ + F(recovered) = rf->rf_is_volatile; + + /* Wait until the moment where we should start recovery. */ + do { + M0_CO_FUN(CO(fom), heq_await(fom, &F(state), &F(eoq))); + if (F(eoq)) + goto out; + } while (F(state) != M0_NC_DTM_RECOVERING); + + while (!F(recovered)) { + M0_CO_FUN(CO(fom), eolq_await(fom, &F(item))); + M0_ASSERT(F(item).ei_type != EIT_END); + + recovery_machine_lock(rf->rf_m); + remote_rf = recovery_fom_by_svc_find(rf->rf_m, + &F(item).ei_source); + if (remote_rf == NULL) { + /* XXX: machine is stopping? */ + recovery_machine_unlock(rf->rf_m); + break; + } + M0_ASSERT(remote_rf != rf); + remote_state_update(remote_rf, &F(item)); + + F(recovered) = is_local_recovery_completed(rf->rf_m); + recovery_machine_unlock(rf->rf_m); + } + + /* + * At this point we do not expect any more EOL messages, so we 'end' + * and flush the queue to ensure it. + */ + M0_BE_QUEUE__FINISH(&rf->rf_eolq, typeof(F(item))); + + /* + * When F(recovered) is true, we know we received all needed EOL + * messages and have recovered everything. 
If it is false, then at this
+         * point it can only mean that the recovery machine is shutting down,
+         * and there is nothing further we need to do.
+         */
+        if (F(recovered)) {
+                /*
+                 * Emit "RECOVERED". It shall cause HA to tell us to
+                 * transition from RECOVERING to ONLINE.
+                 */
+                recovery_machine_recovered(rf->rf_m,
+                                           &rf->rf_tgt_proc, &rf->rf_tgt_svc);
+
+                do {
+                        M0_CO_FUN(CO(fom), heq_await(fom, &F(state),
+                                                     &F(eoq)));
+                        M0_ASSERT(ergo(!F(eoq), M0_IN(F(state),
+                                                      (M0_NC_TRANSIENT,
+                                                       M0_NC_ONLINE))));
+                } while (!F(eoq));
+        }
+
+out:
+        /*
+         * This is not a duplicate of the similar call above -- FINISH is an
+         * idempotent call, and we want to ensure that it is called on all
+         * paths leading out of this function.
+         */
+        M0_BE_QUEUE__FINISH(&rf->rf_eolq, typeof(F(item)));
+        m0_fom_phase_set(fom, RFS_DONE);
+}
+
+static void recovery_fom_coro(struct m0_fom *fom)
+{
+        struct recovery_fom *rf = M0_AMB(rf, fom, rf_base);
+
+        M0_CO_REENTER(CO(fom));
+
+        addb2_relate(&rf->rf_m->rm_sm, &fom->fo_sm_phase);
+
+        if (rf->rf_is_local)
+                M0_CO_FUN(CO(fom), local_recovery_fom_coro(fom));
+        else
+                M0_CO_FUN(CO(fom), remote_recovery_fom_coro(fom));
+}
+
+static int recovery_fom_tick(struct m0_fom *fom)
+{
+        int rc;
+        M0_CO_START(CO(fom));
+        recovery_fom_coro(fom);
+        rc = M0_CO_END(CO(fom));
+        M0_POST(M0_IN(rc, (0, M0_FSO_AGAIN, M0_FSO_WAIT)));
+        return rc ?: M0_FSO_WAIT;
+}
+
+#undef F
+
+M0_INTERNAL void
+m0_ut_remach_heq_post(struct m0_dtm0_recovery_machine *m,
+                      const struct m0_fid *tgt_svc,
+                      enum m0_ha_obj_state state)
+{
+        struct recovery_fom *rf = recovery_fom_by_svc_find_lock(m, tgt_svc);
+        M0_ASSERT_INFO(rf != NULL,
+                       "Trying to post an HA event to a wrong service?");
+        heq_post(rf, state);
+}
+
+M0_INTERNAL void
+m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m,
+                      struct m0_conf_process *procs,
+                      const struct m0_fid *svcs,
+                      bool *is_volatile,
+                      uint64_t objs_nr)
+{
+        uint64_t i;
+        int rc;
+
+        for (i = 0; i < objs_nr; ++i) {
+                rc = recovery_fom_add(m, procs + i, svcs + i, is_volatile[i]);
+                M0_ASSERT(rc == 0);
+        }
+}
+
+M0_INTERNAL void
+m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m,
+                                   struct dtm0_req_fop *redo,
+                                   struct m0_be_op *op)
+{
+        bool is_eol =
+                !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL));
+        bool is_eviction =
+                !!(redo->dtr_flags & M0_BITS(M0_DMF_EVICTION));
+        const struct m0_fid *initiator = &redo->dtr_initiator;
+        struct eolq_item item = {};
+        struct recovery_fom *rf;
+
+        M0_ENTRY("in-redo (m=%p): " REDO_F, m, REDO_P(redo));
+
+        if (is_eol) {
+                M0_ASSERT_INFO(!is_eviction,
+                               "TODO: Eviction is not handled yet.");
+
+                rf = recovery_fom_local(m);
+                if (rf != NULL) {
+                        M0_ASSERT_INFO(equi(is_eviction, !rf->rf_is_local),
+                                       "Participant cannot evict itself.");
+                        item = (struct eolq_item) {
+                                .ei_type = EIT_EOL,
+                                .ei_source = *initiator,
+                        };
+                        m0_be_queue_lock(&rf->rf_eolq);
+                        M0_ASSERT_INFO(!rf->rf_eolq.bq_the_end,
+                                       "REDOs are not allowed if local recovery"
+                                       " has already finished.");
+                        M0_BE_QUEUE_PUT(&rf->rf_eolq, op, &item);
+                        m0_be_queue_unlock(&rf->rf_eolq);
+                } else {
+                        M0_LOG(M0_WARN,
+                               "REDO received but svc is not RECOVERING yet");
+                        m0_be_op_active(op);
+                        m0_be_op_done(op);
+                }
+        } else {
+                M0_LOG(M0_DEBUG, "A non-EOL REDO was ignored.");
+                m0_be_op_active(op);
+                m0_be_op_done(op);
+        }
+
+        M0_LEAVE();
+}
+
+static int default_log_iter_init(struct m0_dtm0_recovery_machine *m,
+                                 struct m0_be_dtm0_log_iter *iter)
+{
+        m0_be_dtm0_log_iter_init(iter, m->rm_svc->dos_log);
+        return 0;
+}
+
+static void default_log_iter_fini(struct m0_dtm0_recovery_machine 
*m, + struct m0_be_dtm0_log_iter *iter) +{ + struct recovery_fom *rf = M0_AMB(rf, iter, rf_log_iter); + M0_ASSERT(rf->rf_m == m); + m0_be_dtm0_log_iter_fini(iter); +} + +static bool participated(const struct m0_dtm0_log_rec *record, + const struct m0_fid *svc) +{ + return m0_exists(i, record->dlr_txd.dtd_ps.dtp_nr, + m0_fid_eq(&record->dlr_txd.dtd_ps.dtp_pa[i].p_fid, + svc)); +} + +static int default_log_iter_next(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter, + const struct m0_fid *tgt_svc, + const struct m0_fid *origin_svc, + struct m0_dtm0_log_rec *record) +{ + struct m0_be_dtm0_log *log = m->rm_svc->dos_log; + int rc; + + /* XXX: not supported yet */ + M0_ASSERT(origin_svc == NULL); + + m0_mutex_lock(&log->dl_lock); + + /* Filter out records where tgt_svc is not a participant. */ + do { + M0_SET0(record); + rc = m0_be_dtm0_log_iter_next(iter, record); + if (rc == 0) { + if (participated(record, tgt_svc)) + break; + else + m0_dtm0_log_iter_rec_fini(record); + } + } while (rc == 0); + + m0_mutex_unlock(&log->dl_lock); + + /* XXX: error codes will be adjusted separately. */ + switch (rc) { + case 0: + return 0; + default: + return M0_ERR(rc); + } +} + +/* + * TODO: It was copy-pasted from setup.c (see cs_ha_process_event)! + * Export cs_ha_process_event instead of using this thing. + */ +static void server_process_event(struct m0_motr *cctx, + enum m0_conf_ha_process_event event) +{ + enum m0_conf_ha_process_type type; + + type = cctx->cc_mkfs ? M0_CONF_HA_PROCESS_M0MKFS : + M0_CONF_HA_PROCESS_M0D; + if (cctx->cc_ha_is_started && !cctx->cc_no_conf && + cctx->cc_motr_ha.mh_link != NULL) { + m0_conf_ha_process_event_post(&cctx->cc_motr_ha.mh_ha, + cctx->cc_motr_ha.mh_link, + &cctx->cc_reqh_ctx.rc_fid, + m0_process(), event, type); + } +} + +/* TODO: copy-pasted from client_init.c (see ha_process_event)! + * Actually, clients should not send RECOVERED but at this moment, + * the SM on the Hare side cannot properly handle this. + */ +static void client_process_event(struct m0_client *m0c, + enum m0_conf_ha_process_event event) +{ + const enum m0_conf_ha_process_type type = M0_CONF_HA_PROCESS_OTHER; + if (m0c->m0c_motr_ha.mh_link != NULL) + m0_conf_ha_process_event_post(&m0c->m0c_motr_ha.mh_ha, + m0c->m0c_motr_ha.mh_link, + &m0c->m0c_process_fid, + m0_process(), event, type); +} + +static void default_ha_event_post(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + enum m0_conf_ha_process_event event) +{ + struct m0_reqh *reqh; + struct m0_client *m0c; + (void) tgt_proc; + (void) tgt_svc; + + if (ALL2ALL) { + M0_LOG(M0_DEBUG, "ALL2ALL_DTM_RECOVERED"); + } + + M0_ASSERT_INFO(m->rm_local_rfom != NULL, + "It is impossible to emit an HA event without local " + "recovery FOM up and running."); + reqh = m0_fom_reqh(&m->rm_local_rfom->rf_base); + + if (!m->rm_local_rfom->rf_is_volatile) { + M0_ASSERT_INFO(m0_cs_reqh_context(reqh) != NULL, + "A fully-functional motr process must " + "have a reqh ctx."); + server_process_event(m0_cs_ctx_get(reqh), event); + } else { + m0c = M0_AMB(m0c, reqh, m0c_reqh); + client_process_event(m0c, event); + } + +} + +static void default_redo_post(struct m0_dtm0_recovery_machine *m, + struct m0_fom *fom, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + struct dtm0_req_fop *redo, + struct m0_be_op *op) +{ + int rc; + + rc = m0_dtm0_req_post(m->rm_svc, op, redo, tgt_svc, fom, true); + /* + * We assume "perfect" links between ONLINE/RECOVERING processes. 
+ * If the link is not perfect then let's just kill the process + * that is not able to send out REDOs. + */ + M0_ASSERT(rc == 0); +} + +M0_INTERNAL const struct m0_dtm0_recovery_machine_ops + m0_dtm0_recovery_machine_default_ops = { + .log_iter_init = default_log_iter_init, + .log_iter_fini = default_log_iter_fini, + .log_iter_next = default_log_iter_next, + + .redo_post = default_redo_post, + .ha_event_post = default_ha_event_post, +}; + +#undef M0_TRACE_SUBSYSTEM +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ +/* + * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap + */ diff --git a/dtm0/recovery.h b/dtm0/recovery.h new file mode 100644 index 00000000000..d83f74482fa --- /dev/null +++ b/dtm0/recovery.h @@ -0,0 +1,183 @@ +/* -*- C -*- */ +/* + * Copyright (c) 2021 Seagate Technology LLC and/or its Affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For any questions about this software or licensing, + * please email opensource@seagate.com or cortx-questions@seagate.com. + * + */ + + +#pragma once + +#ifndef __MOTR_DTM0_RECOVERY_H__ +#define __MOTR_DTM0_RECOVERY_H__ + +#include "lib/tlist.h" /* m0_tl */ +#include "ha/note.h" /* m0_ha_obj_state */ +#include "conf/ha.h" /* m0_conf_ha_process_event */ +#include "sm/sm.h" /* m0_sm, m0_sm_group */ + +/* imports */ +struct m0_dtm0_service; +struct m0_conf_process; +struct dtm0_req_fop; +struct m0_be_dtm0_log_iter; +struct m0_dtm0_log_rec; +struct m0_be_op; +struct m0_fom; + +/* exports */ +struct m0_dtm0_recovery_machine_ops; +struct m0_dtm0_recovery_machine; +struct recovery_fom; + +enum m0_dtm0_recovery_machine_states { + M0_DRMS_INIT, + M0_DRMS_STARTED, + M0_DRMS_STOPPED, + M0_DRMS_NR, +}; + +struct m0_dtm0_recovery_machine_ops { + /** + * Post a REDO message to the target DTM0 service. + * + * The expectation is that `redo` structure is either processed + * immediately, before function completes, or it is cloned and stored + * for future use. Caller will destroy all content of `redo` structure + * right after the call to redo_post(). + */ + void (*redo_post)(struct m0_dtm0_recovery_machine *m, + struct m0_fom *fom, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + struct dtm0_req_fop *redo, + struct m0_be_op *op); + + /** + * Post a conf ha process event. + */ + void (*ha_event_post)(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_proc, + const struct m0_fid *tgt_svc, + enum m0_conf_ha_process_event event); + + int (*log_iter_init)(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter); + + /** + * Get next log record (or -ENOENT) from the local DTM0 log. + * @param[in] tgt_svc DTM0 service this REDO shall be sent to. + * @param[in,opt] origin_svc DTM0 service to be selected. When + * this parameter is set to non-NULL, + * the iter is supposed to select only + * the log records that were originated + * on this particular service. 
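+ *
+ * A typical consumption loop (a sketch only; error handling is elided, and
+ * send_redo_built_from() is a placeholder for the caller's REDO-sending
+ * logic):
+ * @code
+ * if (m->rm_ops->log_iter_init(m, &iter) == 0) {
+ *         while (m->rm_ops->log_iter_next(m, &iter, tgt_svc, NULL,
+ *                                         &rec) == 0)
+ *                 send_redo_built_from(&rec);
+ *         m->rm_ops->log_iter_fini(m, &iter);
+ * }
+ * @endcode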
+ */ + int (*log_iter_next)(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter, + const struct m0_fid *tgt_svc, + const struct m0_fid *origin_svc, + struct m0_dtm0_log_rec *record); + + void (*log_iter_fini)(struct m0_dtm0_recovery_machine *m, + struct m0_be_dtm0_log_iter *iter); +}; + + +struct m0_dtm0_recovery_machine { + struct m0_dtm0_service *rm_svc; + struct m0_tl rm_rfoms; + const struct m0_dtm0_recovery_machine_ops *rm_ops; + struct recovery_fom *rm_local_rfom; + + struct m0_sm_group rm_sm_group; + struct m0_sm rm_sm; +}; + +M0_EXTERN const struct m0_dtm0_recovery_machine_ops + m0_dtm0_recovery_machine_default_ops; + +M0_INTERNAL int m0_drm_domain_init(void); + +M0_INTERNAL void m0_drm_domain_fini(void); + +/** + * Initialise a recovery machine. + * @param ops Recovery machine operations (such as sending of REDOs and some + * others) that have to be used instead of the default ops. + * These ops may be used to alter the effects of recovery machine + * decisions on the system (for example, in UTs). + */ +M0_INTERNAL void +m0_dtm0_recovery_machine_init(struct m0_dtm0_recovery_machine *m, + const struct m0_dtm0_recovery_machine_ops *ops, + struct m0_dtm0_service *svc); + +M0_INTERNAL int +m0_dtm0_recovery_machine_start(struct m0_dtm0_recovery_machine *m); + +M0_INTERNAL void +m0_dtm0_recovery_machine_stop(struct m0_dtm0_recovery_machine *m); + +M0_INTERNAL void +m0_dtm0_recovery_machine_fini(struct m0_dtm0_recovery_machine *m); + +/** + * Post a REDO message into recovery machine. + * + * Recovery machine needs to know what REDO messages were received by the local + * Motr process. It helps to properly advance the state of the local process. + * For example, the transition RECOVERING -> ONLINE may happen only when it + * receives enough REDO messages with EOL flag set (or simply EOL messages). + * + * Note, at this moment recovery machine sends out REDO messages but it does not + * apply incoming REDO messages. It must be done elsewhere. However, there is an + * ongoing effort to change this by getting rid of self-sufficient REDO FOMs. + * After these changes are done, recovery machine as a module will be fully + * responsible for sending and executing REDO messages. + */ +M0_INTERNAL void +m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, + struct dtm0_req_fop *redo, + struct m0_be_op *op); + +/* UT-related API */ +M0_INTERNAL void +m0_ut_remach_heq_post(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *tgt_svc, + enum m0_ha_obj_state state); +M0_INTERNAL void +m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m, + struct m0_conf_process *procs, + const struct m0_fid *svcs, + bool *is_volatile, + uint64_t objs_nr); + +#endif /* __MOTR_DTM0_RECOVERY_H__ */ + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ +/* + * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap + */ diff --git a/dtm0/service.c b/dtm0/service.c index 75e979abcc3..a046a48cf28 100644 --- a/dtm0/service.c +++ b/dtm0/service.c @@ -179,8 +179,12 @@ static int dtm0_process_disconnect(struct dtm0_process *process) if (M0_IS0(&process->dop_rlink)) return M0_RC(0); - rc = m0_rpc_link_is_connected(&process->dop_rlink) ? 
- m0_rpc_link_disconnect_sync(&process->dop_rlink, timeout) : 0; + if (m0_rpc_link_is_connected(&process->dop_rlink)) { + m0_rpc_conn_sessions_cancel(&process->dop_rlink.rlk_conn); + rc = m0_rpc_link_disconnect_sync(&process->dop_rlink, timeout); + } else + rc = 0; + if (M0_IN(rc, (0, -ETIMEDOUT, -ECANCELED))) { /* @@ -263,7 +267,7 @@ static int volatile_log_init(struct m0_dtm0_service *dtm0) rc = m0_be_dtm0_log_init(dtm0->dos_log, NULL, &dtm0->dos_clk_src, false); if (rc != 0) - m0_be_dtm0_log_free(&dtm0->dos_log); + m0_be_dtm0_log_free0(&dtm0->dos_log); } return rc; } @@ -330,10 +334,33 @@ static int dtm_service__origin_fill(struct m0_reqh_service *service) return M0_RC_INFO(rc, "origin=%d", dtm0->dos_origin); } +/* + * Certain UTs manually control the lifetime of recovery machine. + * When manual start-stop is disabled, DTM0 service automatically + * starts-stops the machine. + */ +static bool is_manual_ss_enabled(void) +{ + return M0_FI_ENABLED("ut"); +} + static int dtm0_service_start(struct m0_reqh_service *service) { + struct m0_dtm0_service *dtms = to_dtm(service); + int rc; + M0_PRE(service != NULL); - return dtm_service__origin_fill(service); + rc = dtm_service__origin_fill(service); + if (rc != 0) + return M0_ERR(rc); + + if (ENABLE_DTM0 && !is_manual_ss_enabled()) { + m0_dtm0_recovery_machine_init(&dtms->dos_remach, + NULL, dtms); + rc = m0_dtm0_recovery_machine_start(&dtms->dos_remach); + } + + return M0_RC(rc); } static void dtm0_service_prepare_to_stop(struct m0_reqh_service *reqh_rs) @@ -342,6 +369,8 @@ static void dtm0_service_prepare_to_stop(struct m0_reqh_service *reqh_rs) M0_PRE(reqh_rs != NULL); dtms = M0_AMB(dtms, reqh_rs, dos_generic); + if (ENABLE_DTM0 && !is_manual_ss_enabled()) + m0_dtm0_recovery_machine_stop(&dtms->dos_remach); dtm0_service_conns_term(dtms); } @@ -358,8 +387,11 @@ static void dtm0_service_stop(struct m0_reqh_service *service) if (dtm0->dos_origin == DTM0_ON_VOLATILE && dtm0->dos_log != NULL) { m0_be_dtm0_log_clear(dtm0->dos_log); m0_be_dtm0_log_fini(dtm0->dos_log); - m0_be_dtm0_log_free(&dtm0->dos_log); + m0_be_dtm0_log_free0(&dtm0->dos_log); } + + if (ENABLE_DTM0 && !is_manual_ss_enabled()) + m0_dtm0_recovery_machine_fini(&dtm0->dos_remach); } static void dtm0_service_fini(struct m0_reqh_service *service) @@ -377,12 +409,14 @@ M0_INTERNAL int m0_dtm0_stype_init(void) M0_AVI_DTX0_SM_STATE, M0_AVI_DTX0_SM_COUNTER) ?: m0_dtm0_fop_init() ?: m0_reqh_service_type_register(&dtm0_service_type) ?: - m0_dtm0_rpc_link_mod_init(); + m0_dtm0_rpc_link_mod_init() ?: + m0_drm_domain_init(); } M0_INTERNAL void m0_dtm0_stype_fini(void) { extern struct m0_sm_conf m0_dtx_sm_conf; + m0_drm_domain_fini(); m0_dtm0_rpc_link_mod_fini(); m0_reqh_service_type_unregister(&dtm0_service_type); m0_dtm0_fop_fini(); @@ -411,6 +445,11 @@ m0_dtm0_service_find(const struct m0_reqh *reqh) return rh_srv == NULL ? 
NULL : to_dtm(rh_srv); } +M0_INTERNAL bool m0_dtm0_is_expecting_redo_from_client(void) +{ + return M0_FI_ENABLED("ut"); +} + M0_INTERNAL bool m0_dtm0_in_ut(void) { return M0_FI_ENABLED("ut"); diff --git a/dtm0/service.h b/dtm0/service.h index 087f0b91b4c..6801bd21e21 100644 --- a/dtm0/service.h +++ b/dtm0/service.h @@ -27,6 +27,7 @@ #include "reqh/reqh_service.h" #include "dtm0/clk_src.h" +#include "dtm0/recovery.h" struct m0_be_dtm0_log; struct dtm0_req_fop; @@ -42,19 +43,20 @@ enum m0_dtm0_service_origin { * DTM0 service structure */ struct m0_dtm0_service { - struct m0_reqh_service dos_generic; - struct m0_tl dos_processes; - enum m0_dtm0_service_origin dos_origin; - uint64_t dos_magix; - struct m0_dtm0_clk_src dos_clk_src; - struct m0_be_dtm0_log *dos_log; - /* + struct m0_reqh_service dos_generic; + struct m0_tl dos_processes; + enum m0_dtm0_service_origin dos_origin; + uint64_t dos_magix; + struct m0_dtm0_clk_src dos_clk_src; + struct m0_be_dtm0_log *dos_log; + struct m0_dtm0_recovery_machine dos_remach; + /** * A queue for DTM_TEST message for drlink UTs. * The UTs are fully responsible for the queue init/fini/get. * DTM_TEST fom puts dtm0_req_fop::dtr_txr::dtd_id::dti_fid to the * queue. */ - struct m0_be_queue *dos_ut_queue; + struct m0_be_queue *dos_ut_queue; }; extern struct m0_reqh_service_type dtm0_service_type; @@ -89,6 +91,7 @@ m0_dtm0_service_find(const struct m0_reqh *reqh); M0_INTERNAL struct m0_dtm0_service *m0_dtm0_fom2service(struct m0_fom *fom); M0_INTERNAL bool m0_dtm0_in_ut(void); +M0_INTERNAL bool m0_dtm0_is_expecting_redo_from_client(void); #endif /* __MOTR_DTM0_SERVICE_H__ */ diff --git a/dtm0/ut/drlink.c b/dtm0/ut/drlink.c index 7c3b5cd7064..a26aae6ae7a 100644 --- a/dtm0/ut/drlink.c +++ b/dtm0/ut/drlink.c @@ -60,6 +60,7 @@ void m0_dtm0_ut_drlink_simple(void) int rc; int i; int j; + struct m0_be_queue *q; M0_ALLOC_PTR(udh); M0_ASSERT(udh != NULL); @@ -93,16 +94,16 @@ void m0_dtm0_ut_drlink_simple(void) for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) m0_be_op_init(&op[i]); - M0_ALLOC_PTR(svc->dos_ut_queue); - M0_UT_ASSERT(svc->dos_ut_queue != 0); - rc = m0_be_queue_init(svc->dos_ut_queue, - &(struct m0_be_queue_cfg){ + M0_ALLOC_PTR(q); + M0_UT_ASSERT(q != 0); + rc = m0_be_queue_init(q, &(struct m0_be_queue_cfg){ .bqc_q_size_max = DTM0_UT_DRLINK_SIMPLE_POST_NR, .bqc_producers_nr_max = DTM0_UT_DRLINK_SIMPLE_POST_NR, .bqc_consumers_nr_max = 1, .bqc_item_length = sizeof fid[0], }); M0_UT_ASSERT(rc == 0); + svc->dos_ut_queue = q; for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) { rc = m0_dtm0_req_post(udh->udh_server_dtm0_service, @@ -131,8 +132,6 @@ void m0_dtm0_ut_drlink_simple(void) M0_UT_ASSERT(found); } m0_be_op_fini(&op_out); - m0_be_queue_fini(svc->dos_ut_queue); - m0_free(svc->dos_ut_queue); for (i = 0; i < DTM0_UT_DRLINK_SIMPLE_POST_NR; ++i) M0_UT_ASSERT(m0_fid_eq(&fid[i], &M0_FID0)); m0_free(fid); @@ -144,6 +143,8 @@ void m0_dtm0_ut_drlink_simple(void) m0_free(fop); m0_ut_dtm0_helper_fini(udh); + m0_be_queue_fini(q); + m0_free(q); m0_free(udh); } diff --git a/dtm0/ut/main.c b/dtm0/ut/main.c index 5ce0bdd2e8e..349d4e36eb1 100644 --- a/dtm0/ut/main.c +++ b/dtm0/ut/main.c @@ -30,7 +30,9 @@ #include "ut/ut.h" #include "cas/cas.h" #include "cas/cas_xc.h" +#include "dtm0/recovery.h" +#include "dtm0/ut/helper.h" enum { NUM_CAS_RECS = 10, @@ -99,18 +101,728 @@ static void cas_xcode_test(void) buf, len); M0_UT_ASSERT(rc == 0); - m0_xcode_free_obj(&M0_XCODE_OBJ(m0_cas_op_xc, op_out)); + m0_xcode_free_obj(&M0_XCODE_OBJ(m0_cas_op_xc, op_out)); +} + + +enum 
ut_sides { + UT_SIDE_SRV, + UT_SIDE_CLI, + UT_SIDE_NR +}; + +enum ut_client_persistence { + UT_CP_UNSPECIFIED, + UT_CP_VOLATILE_CLIENT, + UT_CP_PERSISTENT_CLIENT, +}; + +struct m0_fid g_service_fids[UT_SIDE_NR]; + +struct ut_remach { + bool use_real_log; + enum ut_client_persistence cp; + + struct m0_ut_dtm0_helper udh; + + struct m0_dtm0_service *svcs[UT_SIDE_NR]; + struct m0_be_op recovered[UT_SIDE_NR]; + + /* Client-side stubs for conf objects. */ + struct m0_conf_process cli_procs[UT_SIDE_NR]; + struct m0_mutex cli_proc_guards[UT_SIDE_NR]; +}; + +struct ha_thought { + enum ut_sides who; + enum m0_ha_obj_state what; +}; +#define HA_THOUGHT(_who, _what) (struct ha_thought) { \ + .who = _who, .what = _what \ +} + +static struct m0_dtm0_service *ut_remach_svc_get(struct ut_remach *um, + enum ut_sides side) +{ + M0_UT_ASSERT(side < UT_SIDE_NR); + M0_UT_ASSERT(um->svcs[side] != NULL); + return um->svcs[side]; +} + +static struct m0_dtm0_recovery_machine *ut_remach_get(struct ut_remach *um, + enum ut_sides side) +{ + return &ut_remach_svc_get(um, side)->dos_remach; +} + +static struct m0_fid *ut_remach_fid_get(enum ut_sides side) +{ + M0_UT_ASSERT(side < UT_SIDE_NR); + return &g_service_fids[side]; +} + +static enum ut_sides ut_remach_side_get(const struct m0_fid *svc) +{ + enum ut_sides side; + + for (side = 0; side < UT_SIDE_NR; ++side) { + if (m0_fid_eq(ut_remach_fid_get(side), svc)) + break; + } + + M0_UT_ASSERT(side < UT_SIDE_NR); + return side; +} + +static struct ut_remach *ut_remach_from(struct m0_dtm0_recovery_machine *m, + const struct m0_fid *svc_fid) +{ + struct ut_remach *um = NULL; + struct m0_dtm0_service *svc = M0_AMB(svc, m, dos_remach); + struct m0_reqh *reqh = svc->dos_generic.rs_reqh; + struct m0_reqh_context *rx = svc->dos_generic.rs_reqh_ctx; + struct m0_rpc_client_ctx *cli_ctx; + struct m0_motr *motr_ctx; + struct m0_rpc_server_ctx *srv_ctx; + struct m0_ut_dtm0_helper *udh; + enum ut_sides side = ut_remach_side_get(svc_fid); + + if (rx == NULL) { + M0_UT_ASSERT(side == UT_SIDE_CLI); + cli_ctx = M0_AMB(cli_ctx, reqh, rcx_reqh); + udh = M0_AMB(udh, cli_ctx, udh_cctx); + um = M0_AMB(um, udh, udh); + } else { + M0_UT_ASSERT(side == UT_SIDE_SRV); + motr_ctx = M0_AMB(motr_ctx, rx, cc_reqh_ctx); + srv_ctx = M0_AMB(srv_ctx, motr_ctx, rsx_motr_ctx); + udh = M0_AMB(udh, srv_ctx, udh_sctx); + um = M0_AMB(um, udh, udh); + } + + M0_UT_ASSERT(um != NULL); + M0_UT_ASSERT(ut_remach_get(um, side) == m); + return um; +} + +static void ut_remach_log_add_sync(struct ut_remach *um, + enum ut_sides side, + struct m0_dtm0_tx_desc *txd, + struct m0_buf *payload) +{ + struct m0_dtm0_recovery_machine *m = ut_remach_get(um, side); + struct m0_dtm0_service *svc = m->rm_svc; + struct m0_be_dtm0_log *log = svc->dos_log; + struct m0_be_tx *tx = NULL; + struct m0_be_seg *seg = log->dl_seg; + struct m0_be_tx_credit cred = {}; + struct m0_be_ut_backend *ut_be; + int rc; + + if (log->dl_is_persistent) { + M0_UT_ASSERT(svc->dos_generic.rs_reqh_ctx != NULL); + ut_be = &svc->dos_generic.rs_reqh_ctx->rc_be; + m0_be_dtm0_log_credit(M0_DTML_EXECUTED, txd, payload, seg, + NULL, &cred); + M0_ALLOC_PTR(tx); + M0_UT_ASSERT(tx != NULL); + m0_be_ut_tx_init(tx, ut_be); + m0_be_tx_prep(tx, &cred); + rc = m0_be_tx_open_sync(tx); + M0_UT_ASSERT(rc == 0); + } + + m0_mutex_lock(&log->dl_lock); + rc = m0_be_dtm0_log_update(log, tx, txd, payload); + M0_UT_ASSERT(rc == 0); + m0_mutex_unlock(&log->dl_lock); + + if (log->dl_is_persistent) { + m0_be_tx_close_sync(tx); + m0_be_tx_fini(tx); + m0_free(tx); + } +} + + +static void 
um_dummy_log_redo_post(struct m0_dtm0_recovery_machine *m,
+                       struct m0_fom *fom,
+                       const struct m0_fid *tgt_proc,
+                       const struct m0_fid *tgt_svc,
+                       struct dtm0_req_fop *redo,
+                       struct m0_be_op *op)
+{
+        struct ut_remach *um = NULL;
+        struct m0_dtm0_recovery_machine *counterpart = NULL;
+
+        um = ut_remach_from(m, &redo->dtr_initiator);
+        counterpart = ut_remach_get(um, ut_remach_side_get(tgt_svc));
+        m0_dtm0_recovery_machine_redo_post(counterpart, redo, op);
+        M0_UT_ASSERT(m0_be_op_is_done(op));
+}
+
+static void um_real_log_redo_post(struct m0_dtm0_recovery_machine *m,
+                                  struct m0_fom *fom,
+                                  const struct m0_fid *tgt_proc,
+                                  const struct m0_fid *tgt_svc,
+                                  struct dtm0_req_fop *redo,
+                                  struct m0_be_op *op)
+{
+        struct ut_remach *um = NULL;
+        struct m0_dtm0_recovery_machine *counterpart = NULL;
+        enum ut_sides tgt_side = ut_remach_side_get(tgt_svc);
+        struct m0_dtm0_service *svc;
+        struct m0_be_ut_backend *ut_be;
+
+        um = ut_remach_from(m, &redo->dtr_initiator);
+        counterpart = ut_remach_get(um, tgt_side);
+
+        /* Empty REDOs are allowed only when EOL is set. */
+        M0_UT_ASSERT(ergo(m0_dtm0_tx_desc_is_none(&redo->dtr_txr),
+                          !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL))));
+
+        /* Emulate REDO FOM: update the log */
+        if (!m0_dtm0_tx_desc_is_none(&redo->dtr_txr))
+                ut_remach_log_add_sync(um, tgt_side, &redo->dtr_txr,
+                                       &redo->dtr_payload);
+
+        m0_dtm0_recovery_machine_redo_post(counterpart, redo, op);
+        M0_UT_ASSERT(m0_be_op_is_done(op));
+
+        /*
+         * It is a sordid but simple way of making ::ut_remach_log_add_sync
+         * work:
+         * the RPC client does not have a fully-functional context, so that
+         * sm-based BE logic cannot progress because there is no BE associated
+         * with the corresponding FOM (fom -> reqh -> context -> be).
+         * However, both sides share the same set of localities,
+         * so that we can sit down right here and wait until everything
+         * is completed.
+         * It might be slow and dangerous but it is enough for a simple test.
+         */
+        if (!m0_dtm0_tx_desc_is_none(&redo->dtr_txr) &&
+            tgt_side == UT_SIDE_SRV) {
+                svc = counterpart->rm_svc;
+                ut_be = &svc->dos_generic.rs_reqh_ctx->rc_be;
+                m0_be_ut_backend_sm_group_asts_run(ut_be);
+                m0_be_ut_backend_thread_exit(ut_be);
+        }
+}
+
+static int um_dummy_log_iter_next(struct m0_dtm0_recovery_machine *m,
+                                  struct m0_be_dtm0_log_iter *iter,
+                                  const struct m0_fid *tgt_svc,
+                                  const struct m0_fid *origin_svc,
+                                  struct m0_dtm0_log_rec *record)
+{
+        M0_SET0(record);
+        return -ENOENT;
+}
+
+static int um_dummy_log_iter_init(struct m0_dtm0_recovery_machine *m,
+                                  struct m0_be_dtm0_log_iter *iter)
+{
+        (void) m;
+        (void) iter;
+        return 0;
+}
+
+static void um_dummy_log_iter_fini(struct m0_dtm0_recovery_machine *m,
+                                   struct m0_be_dtm0_log_iter *iter)
+{
+        (void) m;
+        (void) iter;
+        /* nothing to do */
+}
+
+void um_ha_event_post(struct m0_dtm0_recovery_machine *m,
+                      const struct m0_fid *tgt_proc,
+                      const struct m0_fid *tgt_svc,
+                      enum m0_conf_ha_process_event event)
+{
+        struct ut_remach *um = ut_remach_from(m, tgt_svc);
+        enum ut_sides side = ut_remach_side_get(tgt_svc);
+
+        switch (event) {
+        case M0_CONF_HA_PROCESS_DTM_RECOVERED:
+                m0_be_op_done(&um->recovered[side]);
+                break;
+        default:
+                M0_UT_ASSERT(false);
+        }
+}
+
+/*
+ * Unicast an HA thought to a particular side.
+ */
+static void ut_remach_ha_tells(struct ut_remach *um,
+                               const struct ha_thought *t,
+                               enum ut_sides whom)
+{
+        m0_ut_remach_heq_post(ut_remach_get(um, whom),
+                              ut_remach_fid_get(t->who), t->what);
+}
+
+/*
+ * Multicast an HA thought to all the sides.
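+ * For example, to let every side know that the server went ONLINE
+ * (illustrative; this mirrors the calls made by the tests below):
+ * @code
+ *         ut_remach_ha_thinks(um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+ * @endcode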
+ */ +static void ut_remach_ha_thinks(struct ut_remach *um, + const struct ha_thought *t) +{ + enum ut_sides side; + + for (side = 0; side < UT_SIDE_NR; ++side) + ut_remach_ha_tells(um, t, side); +} + +static const struct m0_dtm0_recovery_machine_ops* +ut_remach_ops_get(struct ut_remach *um) +{ + static struct m0_dtm0_recovery_machine_ops dummy_log_ops = {}; + static struct m0_dtm0_recovery_machine_ops real_log_ops = {}; + static bool initialized = false; + + if (!initialized) { + /* Dummy log operations */ + dummy_log_ops = m0_dtm0_recovery_machine_default_ops; + + dummy_log_ops.log_iter_next = um_dummy_log_iter_next; + dummy_log_ops.log_iter_init = um_dummy_log_iter_init; + dummy_log_ops.log_iter_fini = um_dummy_log_iter_fini; + + dummy_log_ops.redo_post = um_dummy_log_redo_post; + dummy_log_ops.ha_event_post = um_ha_event_post; + + /* Real log operations */ + real_log_ops = m0_dtm0_recovery_machine_default_ops; + + real_log_ops.redo_post = um_real_log_redo_post; + real_log_ops.ha_event_post = um_ha_event_post; + /* + * Do not reassign log ops, as we need to deal with real DTM0 + * log. + */ + + initialized = true; + } + + return um->use_real_log ? &real_log_ops : &dummy_log_ops; +} + +static void ut_srv_remach_init(struct ut_remach *um) +{ + m0_dtm0_recovery_machine_init(ut_remach_get(um, UT_SIDE_SRV), + ut_remach_ops_get(um), + ut_remach_svc_get(um, UT_SIDE_SRV)); +} + +static void ut_cli_remach_conf_obj_init(struct ut_remach *um) +{ + int i; + + for (i = 0; i < UT_SIDE_NR; ++i) { + m0_mutex_init(&um->cli_proc_guards[i]); + m0_chan_init(&um->cli_procs[i].pc_obj.co_ha_chan, + &um->cli_proc_guards[i]); + } +} + +static void ut_cli_remach_conf_obj_fini(struct ut_remach *um) +{ + int i; + + for (i = 0; i < UT_SIDE_NR; ++i) { + m0_chan_fini_lock(&um->cli_procs[i].pc_obj.co_ha_chan); + m0_mutex_fini(&um->cli_proc_guards[i]); + } +} + + +static void ut_cli_remach_init(struct ut_remach *um) +{ + ut_cli_remach_conf_obj_init(um); + + m0_dtm0_recovery_machine_init(ut_remach_get(um, UT_SIDE_CLI), + ut_remach_ops_get(um), + um->svcs[UT_SIDE_CLI]); +} + +static void ut_srv_remach_fini(struct ut_remach *um) +{ + m0_dtm0_recovery_machine_fini(ut_remach_get(um, UT_SIDE_SRV)); +} + +static void ut_cli_remach_fini(struct ut_remach *um) +{ + m0_dtm0_recovery_machine_fini(ut_remach_get(um, UT_SIDE_CLI)); + ut_cli_remach_conf_obj_fini(um); +} + +static void ut_remach_start(struct ut_remach *um) +{ + enum ut_sides side; + int rc; + bool is_volatile[UT_SIDE_NR] = { + [UT_SIDE_SRV] = false, + [UT_SIDE_CLI] = um->cp == UT_CP_VOLATILE_CLIENT, + }; + + m0_ut_remach_populate(ut_remach_get(um, UT_SIDE_CLI), um->cli_procs, + g_service_fids, is_volatile, UT_SIDE_NR); + for (side = 0; side < UT_SIDE_NR; ++side) { + rc = m0_dtm0_recovery_machine_start(ut_remach_get(um, side)); + M0_ASSERT(rc == 0); + } +} + +static void ut_remach_stop(struct ut_remach *um) +{ + enum ut_sides side; + for (side = 0; side < UT_SIDE_NR; ++side) + m0_dtm0_recovery_machine_stop(ut_remach_get(um, side)); +} + +static void ut_remach_init(struct ut_remach *um) +{ + int i; + + M0_UT_ASSERT(M0_IN(um->cp, (UT_CP_PERSISTENT_CLIENT, + UT_CP_VOLATILE_CLIENT))); + + for (i = 0; i < ARRAY_SIZE(um->recovered); ++i) { + m0_be_op_init(um->recovered + i); + m0_be_op_active(um->recovered + i); + } + + m0_fi_enable("m0_dtm0_in_ut", "ut"); + m0_fi_enable("is_manual_ss_enabled", "ut"); + m0_fi_enable("m0_dtm0_is_expecting_redo_from_client", "ut"); + if (um->cp == UT_CP_PERSISTENT_CLIENT) + m0_fi_enable("is_svc_volatile", "always_false"); + + 
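+        /*
+         * The helper below boots a server/client reqh pair. The fault
+         * injection points enabled above keep the recovery machine under
+         * manual (UT-driven) start/stop control.
+         */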
+        m0_ut_dtm0_helper_init(&um->udh);
+
+        g_service_fids[UT_SIDE_SRV] = um->udh.udh_server_dtm0_fid;
+        g_service_fids[UT_SIDE_CLI] = um->udh.udh_client_dtm0_fid;
+
+        M0_UT_ASSERT(um->udh.udh_server_dtm0_service != NULL);
+        um->svcs[UT_SIDE_SRV] = um->udh.udh_server_dtm0_service;
+        M0_UT_ASSERT(um->udh.udh_client_dtm0_service != NULL);
+        um->svcs[UT_SIDE_CLI] = um->udh.udh_client_dtm0_service;
+
+        ut_srv_remach_init(um);
+        ut_cli_remach_init(um);
+}
+
+static void ut_remach_fini(struct ut_remach *um)
+{
+        int i;
+
+        ut_cli_remach_fini(um);
+        ut_srv_remach_fini(um);
+        m0_ut_dtm0_helper_fini(&um->udh);
+        if (um->cp == UT_CP_PERSISTENT_CLIENT)
+                m0_fi_disable("is_svc_volatile", "always_false");
+        m0_fi_disable("m0_dtm0_is_expecting_redo_from_client", "ut");
+        m0_fi_disable("is_manual_ss_enabled", "ut");
+        m0_fi_disable("m0_dtm0_in_ut", "ut");
+        for (i = 0; i < ARRAY_SIZE(um->recovered); ++i) {
+                if (!m0_be_op_is_done(um->recovered + i))
+                        m0_be_op_done(um->recovered + i);
+                m0_be_op_fini(um->recovered + i);
+        }
+
+        M0_SET_ARR0(g_service_fids);
+}
+
+static void ut_remach_reset_srv(struct ut_remach *um)
+{
+        int rc;
+        struct m0_dtm0_recovery_machine *m = ut_remach_get(um, UT_SIDE_SRV);
+
+        m0_dtm0_recovery_machine_stop(m);
+        m0_dtm0_recovery_machine_fini(m);
+        m0_dtm0_recovery_machine_init(m, ut_remach_ops_get(um),
+                                      ut_remach_svc_get(um, UT_SIDE_SRV));
+        rc = m0_dtm0_recovery_machine_start(m);
+        M0_UT_ASSERT(rc == 0);
+}
+
+static void ut_remach_log_gen_sync(struct ut_remach *um,
+                                   enum ut_sides side,
+                                   uint64_t ts_start,
+                                   uint64_t records_nr)
+{
+        struct m0_dtm0_tx_desc txd = {};
+        struct m0_buf payload = {};
+        int rc;
+        int i;
+
+        rc = m0_dtm0_tx_desc_init(&txd, 1);
+        M0_UT_ASSERT(rc == 0);
+        txd.dtd_ps.dtp_pa[0] = (struct m0_dtm0_tx_pa) {
+                .p_state = M0_DTPS_EXECUTED,
+                .p_fid = *ut_remach_fid_get(UT_SIDE_SRV),
+        };
+        txd.dtd_id = (struct m0_dtm0_tid) {
+                .dti_ts.dts_phys = 0,
+                .dti_fid = *ut_remach_fid_get(UT_SIDE_CLI),
+        };
+
+        for (i = 0; i < records_nr; ++i) {
+                txd.dtd_id.dti_ts.dts_phys = ts_start + i;
+                ut_remach_log_add_sync(um, side, &txd, &payload);
+        }
+
+        m0_dtm0_tx_desc_fini(&txd);
+}
+
+/*
+ * Ensures that DTM0 log A is a subset of DTM0 log B; and,
+ * optionally, that A has exactly "expected_records_nr" log records
+ * (if expected_records_nr < 0 then this check is omitted).
+ * Note: pairs (tid, payload) are used as comparison keys. The states
+ * of participants and the other fields are ignored.
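+ *
+ * For instance, after a successful replay the client log must be fully
+ * contained in the server log; remach_real_log_replay() below checks this
+ * with:
+ * @code
+ *         log_subset_verify(&um, records_nr, UT_SIDE_CLI, UT_SIDE_SRV);
+ * @endcode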
+ */
+static void log_subset_verify(struct ut_remach *um,
+                              int expected_records_nr,
+                              enum ut_sides a_side,
+                              enum ut_sides b_side)
+{
+        struct m0_be_dtm0_log *a_log =
+                ut_remach_get(um, a_side)->rm_svc->dos_log;
+        struct m0_be_dtm0_log *b_log =
+                ut_remach_get(um, b_side)->rm_svc->dos_log;
+        struct m0_be_dtm0_log_iter a_iter;
+        struct m0_dtm0_log_rec a_record;
+        struct m0_dtm0_log_rec *b_record;
+        struct m0_buf *a_buf;
+        struct m0_buf *b_buf;
+        struct m0_dtm0_tid *tid;
+        int rc;
+        uint64_t actual_records_nr = 0;
+
+        m0_mutex_lock(&a_log->dl_lock);
+        m0_mutex_lock(&b_log->dl_lock);
+
+        m0_be_dtm0_log_iter_init(&a_iter, a_log);
+
+        while (true) {
+                rc = m0_be_dtm0_log_iter_next(&a_iter, &a_record);
+                M0_UT_ASSERT(M0_IN(rc, (0, -ENOENT)));
+                if (rc == -ENOENT)
+                        break;
+                M0_UT_ASSERT(rc == 0);
+                tid = &a_record.dlr_txd.dtd_id;
+                b_record = m0_be_dtm0_log_find(b_log, tid);
+                M0_UT_ASSERT(b_record != NULL);
+                a_buf = &a_record.dlr_payload;
+                b_buf = &b_record->dlr_payload;
+                M0_UT_ASSERT(equi(m0_buf_is_set(a_buf), m0_buf_is_set(b_buf)));
+                M0_UT_ASSERT(ergo(m0_buf_is_set(a_buf),
+                                  m0_buf_eq(a_buf, b_buf)));
+                m0_dtm0_log_iter_rec_fini(&a_record);
+                actual_records_nr++;
+        }
+
+        M0_UT_ASSERT(ergo(expected_records_nr >= 0,
+                          expected_records_nr == actual_records_nr));
+
+        m0_mutex_unlock(&b_log->dl_lock);
+        m0_mutex_unlock(&a_log->dl_lock);
+}
+
+/* Case: Ensure the machine is initialised properly. */
+static void remach_init_fini(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+        ut_remach_init(&um);
+        ut_remach_fini(&um);
+}
+
+/* Case: Ensure the machine is able to start/stop. */
+static void remach_start_stop(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+        ut_remach_init(&um);
+        ut_remach_start(&um);
+        ut_remach_stop(&um);
+        ut_remach_fini(&um);
+}
+
+static void ut_remach_boot(struct ut_remach *um)
+{
+        const struct ha_thought starting[] = {
+                HA_THOUGHT(UT_SIDE_CLI, M0_NC_TRANSIENT),
+                HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT),
+
+                HA_THOUGHT(UT_SIDE_CLI, M0_NC_DTM_RECOVERING),
+                HA_THOUGHT(UT_SIDE_SRV, M0_NC_DTM_RECOVERING),
+        };
+        const struct ha_thought started[] = {
+                HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE),
+        };
+        int i;
+
+        ut_remach_init(um);
+        ut_remach_start(um);
+
+        for (i = 0; i < ARRAY_SIZE(starting); ++i)
+                ut_remach_ha_thinks(um, starting + i);
+
+        for (i = 0; i < ARRAY_SIZE(um->recovered); ++i)
+                m0_be_op_wait(um->recovered + i);
+
+        for (i = 0; i < ARRAY_SIZE(started); ++i)
+                ut_remach_ha_thinks(um, started + i);
+}
+
+static void ut_remach_shutdown(struct ut_remach *um)
+{
+        ut_remach_stop(um);
+        M0_UT_ASSERT(m0_be_op_is_done(&um->recovered[UT_SIDE_SRV]));
+        M0_UT_ASSERT(m0_be_op_is_done(&um->recovered[UT_SIDE_CLI]));
+        ut_remach_fini(um);
+}
+
+/* Use-case: graceful boot and shutdown of a 2-node cluster. */
+static void remach_boot_cluster(enum ut_client_persistence cp)
+{
+        struct ut_remach um = { .cp = cp };
+
+        ut_remach_boot(&um);
+        ut_remach_shutdown(&um);
+}
+
+static void remach_boot_cluster_ss(void)
+{
+        remach_boot_cluster(UT_CP_PERSISTENT_CLIENT);
+}
+
+static void remach_boot_cluster_cs(void)
+{
+        remach_boot_cluster(UT_CP_VOLATILE_CLIENT);
+}
+
+/* Use-case: reboot an ONLINE node.
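+ * Scenario: boot the cluster, reset the server-side machine, then replay
+ * the HA sequence TRANSIENT -> DTM_RECOVERING -> ONLINE and wait for the
+ * RECOVERED event.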
+ */
+static void remach_reboot_server(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+
+        ut_remach_boot(&um);
+
+        m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+        m0_be_op_active(um.recovered + UT_SIDE_SRV);
+        ut_remach_reset_srv(&um);
+
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                           UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+                                             M0_NC_DTM_RECOVERING));
+        m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+        ut_remach_shutdown(&um);
+}
+
+/* Use-case: reboot a node when it has just started to recover. */
+static void remach_reboot_twice(void)
+{
+        struct ut_remach um = { .cp = UT_CP_PERSISTENT_CLIENT };
+
+        ut_remach_boot(&um);
+
+        m0_be_op_reset(um.recovered + UT_SIDE_SRV);
+        m0_be_op_active(um.recovered + UT_SIDE_SRV);
+        ut_remach_reset_srv(&um);
+
+        /*
+         * Do not tell the client about the failure.
+         * No REDOs would be sent, so that we can see what happens
+         * in the case where the recovery machine has to be stopped
+         * in the middle of waiting for REDOs.
+         */
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT),
+                           UT_SIDE_SRV);
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                           UT_SIDE_SRV);
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_DTM_RECOVERING),
+                           UT_SIDE_SRV);
+        ut_remach_reset_srv(&um);
+        M0_UT_ASSERT(!m0_be_op_is_done(&um.recovered[UT_SIDE_SRV]));
+
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT));
+        ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE),
+                           UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV,
+                                             M0_NC_DTM_RECOVERING));
+        m0_be_op_wait(um.recovered + UT_SIDE_SRV);
+        ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE));
+
+        ut_remach_shutdown(&um);
+}
+
+/* Use-case: replay an empty DTM0 log. */
+static void remach_boot_real_log(void)
+{
+        struct ut_remach um = {
+                .cp = UT_CP_PERSISTENT_CLIENT,
+                .use_real_log = true
+        };
+        ut_remach_boot(&um);
+        ut_remach_shutdown(&um);
+}
+
+/* Use-case: replay a non-empty client log to the server.
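+ * The client log is pre-populated with ut_remach_log_gen_sync(); once the
+ * rebooted server reports RECOVERED, log_subset_verify() checks that every
+ * generated record has reached the server log.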
*/ +static void remach_real_log_replay(void) +{ + struct ut_remach um = { + .cp = UT_CP_PERSISTENT_CLIENT, + .use_real_log = true + }; + /* cafe bell */ + const uint64_t since = 0xCAFEBELL; + const uint64_t records_nr = 10; + + ut_remach_boot(&um); + + ut_remach_log_gen_sync(&um, UT_SIDE_CLI, since, records_nr); + + m0_be_op_reset(um.recovered + UT_SIDE_SRV); + m0_be_op_active(um.recovered + UT_SIDE_SRV); + ut_remach_reset_srv(&um); + + ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_TRANSIENT)); + ut_remach_ha_tells(&um, &HA_THOUGHT(UT_SIDE_CLI, M0_NC_ONLINE), + UT_SIDE_SRV); + ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, + M0_NC_DTM_RECOVERING)); + m0_be_op_wait(um.recovered + UT_SIDE_SRV); + log_subset_verify(&um, records_nr, UT_SIDE_CLI, UT_SIDE_SRV); + ut_remach_ha_thinks(&um, &HA_THOUGHT(UT_SIDE_SRV, M0_NC_ONLINE)); + + ut_remach_shutdown(&um); } extern void m0_dtm0_ut_drlink_simple(void); extern void m0_dtm0_ut_domain_init_fini(void); struct m0_ut_suite dtm0_ut = { - .ts_name = "dtm0-ut", - .ts_tests = { - { "xcode", &cas_xcode_test }, - { "drlink-simple", &m0_dtm0_ut_drlink_simple }, - { "domain_init-fini", &m0_dtm0_ut_domain_init_fini }, + .ts_name = "dtm0-ut", + .ts_tests = { + { "xcode", cas_xcode_test }, + { "drlink-simple", &m0_dtm0_ut_drlink_simple }, + { "domain_init-fini", &m0_dtm0_ut_domain_init_fini }, + { "remach-init-fini", remach_init_fini }, + { "remach-start-stop", remach_start_stop }, + { "remach-boot-cluster-ss", remach_boot_cluster_ss }, + { "remach-boot-cluster-cs", remach_boot_cluster_cs }, + { "remach-reboot-server", remach_reboot_server }, + { "remach-reboot-twice", remach_reboot_twice }, + { "remach-boot-real-log", remach_boot_real_log }, + { "remach-real-log-replay", remach_real_log_replay }, { NULL, NULL }, } }; @@ -124,3 +836,6 @@ struct m0_ut_suite dtm0_ut = { * scroll-step: 1 * End: */ +/* + * vim: tabstop=8 shiftwidth=8 noexpandtab textwidth=80 nowrap + */ diff --git a/lib/bitmap.c b/lib/bitmap.c index 309160c73e5..fcd43532e96 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -200,10 +200,11 @@ M0_INTERNAL void m0_bitmap_onwire_fini(struct m0_bitmap_onwire *ow_map) M0_INTERNAL void m0_bitmap_store(const struct m0_bitmap *im_map, struct m0_bitmap_onwire *ow_map) { - size_t s = M0_BITMAP_WORDS(im_map->b_nr); + size_t s; M0_PRE(im_map != NULL && ow_map != NULL); M0_PRE(im_map->b_words != NULL); + s = M0_BITMAP_WORDS(im_map->b_nr); M0_PRE(s == ow_map->bo_size); memcpy(ow_map->bo_words, im_map->b_words, diff --git a/lib/user_space/timer.c b/lib/user_space/timer.c index de098357149..8c72f3115ef 100644 --- a/lib/user_space/timer.c +++ b/lib/user_space/timer.c @@ -52,6 +52,10 @@ static const m0_time_t zero_time = M0_MKTIME(0, 0); /** Clock source for M0_TIMER_HARD. @see timer_posix_set() */ static int clock_source_timer = -1; +/* Number of allocated HARD timers */ +static unsigned long long hard_timer_alloc = 0; +static unsigned long long hard_timer_free = 0; + /** Typed list of m0_timer_tid structures. */ @@ -193,9 +197,19 @@ static int timer_posix_init(struct m0_timer *timer) } rc = timer_create(clock_source_timer, &se, &ptimer); /* preserve timer->t_ptimer if timer_create() isn't succeeded */ - if (rc == 0) + if (rc == 0) { timer->t_ptimer = ptimer; - return M0_RC(rc); + hard_timer_alloc++; + } else { + rc = M0_ERR(-errno); + M0_LOG(M0_ERROR, "Failed to allocate HARD timer (%d), " + "alloc=%lld, free=%lld, inuse=%lld", + rc, + hard_timer_alloc, + hard_timer_free, + hard_timer_alloc - hard_timer_free); + } + return rc != 0 ? 
M0_ERR(rc) : 0; } /** @@ -210,6 +224,7 @@ static void timer_posix_fini(timer_t posix_timer) * timer_delete() can fail iff timer->t_ptimer isn't valid timer ID. */ M0_ASSERT_INFO(rc == 0, "rc = %d", rc); + hard_timer_free++; } static m0_time_t timer_time_to_realtime(m0_time_t expire) diff --git a/lib/ut/cookie.c b/lib/ut/cookie.c index 2c9ab7c0e33..45e1ebdb7e3 100644 --- a/lib/ut/cookie.c +++ b/lib/ut/cookie.c @@ -141,7 +141,7 @@ static void addr_sanity(const uint64_t *addr, bool sane, bool aligned) void test_cookie(void) { int insane_cnt = 0; - uint64_t automatic; + uint64_t automatic = 0; uint64_t *dynamic; uint64_t i; struct m0_cookie cookie_test; diff --git a/motr/idx_dix.c b/motr/idx_dix.c index bcf71de729a..7bee805a564 100644 --- a/motr/idx_dix.c +++ b/motr/idx_dix.c @@ -71,6 +71,14 @@ struct dix_req { struct m0_op_idx *idr_oi; struct m0_sm_ast idr_ast; struct m0_clink idr_clink; + + /** + * Clink that connects STABLE/FAILURE states of DTX with this dix_req. + * Note, the clink is always initialised but not always attached + * to a dtx. + */ + struct m0_clink idr_dtx_clink; + /** * Starting key for NEXT operation. * It's allocated internally and key value is copied from user. @@ -614,6 +622,7 @@ static int dix_mreq_create(struct m0_op_idx *oi, M0_ALLOC_PTR(req); if (req == NULL) return M0_ERR(-ENOMEM); + m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb); if (idx_is_distributed(oi)) { m0_dix_meta_req_init(&req->idr_mreq, op_dixc(oi), oi->oi_sm_grp); @@ -651,14 +660,13 @@ static int dix_req_create(struct m0_op_idx *oi, M0_ALLOC_PTR(req); if (req != NULL) { + m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb); if (idx_is_distributed(oi)) { m0_dix_req_init(&req->idr_dreq, op_dixc(oi), oi->oi_sm_grp); to_dix_map(&oi->oi_oc.oc_op, &req->idr_dreq); req->idr_dreq.dr_dtx = oi->oi_dtx; - m0_clink_init(&req->idr_clink, - oi->oi_dtx != NULL ? - dixreq_clink_dtx_cb : dixreq_clink_cb); + m0_clink_init(&req->idr_clink, dixreq_clink_cb); /* Store oi for dix callbacks to update SYNC records. */ if (M0_IN(oi->oi_oc.oc_op.op_code, @@ -690,6 +698,7 @@ static void dix_req_destroy(struct dix_req *req) } else { m0_cas_req_fini(&req->idr_creq); } + m0_clink_fini(&req->idr_dtx_clink); m0_free(req); M0_LEAVE(); } @@ -718,6 +727,7 @@ static void dixreq_completed_post(struct dix_req *req, int rc) struct m0_op_idx *oi = req->idr_oi; M0_ENTRY(); + M0_ASSERT(req->idr_ast.sa_cb != dixreq_completed_ast); oi->oi_ar.ar_rc = rc; req->idr_ast.sa_cb = dixreq_completed_ast; req->idr_ast.sa_datum = req; @@ -826,108 +836,42 @@ static bool dix_meta_req_clink_cb(struct m0_clink *cl) return false; } -static void dixreq_stable_ast(struct m0_sm_group *grp, struct m0_sm_ast *ast) -{ - struct dix_req *req = ast->sa_datum; - struct m0_op_idx *oi = req->idr_oi; - int rc = oi->oi_ar.ar_rc; - - M0_ENTRY(); - oi->oi_ar.ar_ast.sa_cb = (rc == 0) ? 
idx_op_ast_stable : NULL; - M0_ASSERT(grp == oi->oi_sm_grp); - dix_req_destroy(req); - idx_op_ast_stable(oi->oi_sm_grp, &oi->oi_ar.ar_ast); - M0_LEAVE(); -} - -static void dixreq_stable_post(struct dix_req *req, int rc) +static bool dix_req_is_completed(struct dix_req *dix_req) { - struct m0_op_idx *oi = req->idr_oi; + struct m0_dix_req *dreq = &dix_req->idr_dreq; + struct m0_op_idx *oi = dix_req->idr_oi; + struct m0_dtx *dtx = oi->oi_dtx; + bool is_executed = + M0_IN(dreq->dr_sm.sm_state, (DIXREQ_FAILURE, DIXREQ_FINAL)); + bool is_stable = dtx == NULL || + M0_IN(m0_dtx0_sm_state(dtx), (M0_DDS_STABLE, M0_DDS_FAILED)); - M0_ENTRY(); - oi->oi_ar.ar_rc = rc; - req->idr_ast.sa_cb = dixreq_stable_ast; - req->idr_ast.sa_datum = req; - M0_ASSERT_INFO(req->idr_ast.sa_next == NULL, - "Stable() ast cannot be armed before Executed() " - "is completed. Ensure EXECUTED_ALL -> STABLE transition" - "does not happen within the same ast tick"); - m0_sm_ast_post(oi->oi_sm_grp, &req->idr_ast); - M0_LEAVE(); -} - -static void dixreq_executed_post(struct dix_req *req, int rc) -{ - struct m0_op_idx *oi = req->idr_oi; - - M0_ENTRY(); - - M0_ASSERT_INFO(rc == 0, "TODO: Failures are not handled here."); - oi->oi_ar.ar_rc = rc; - - /* XXX: DTX cannot be canceled (as per the current design), - * so that once we got a reply, we prohibit any kind of cancelation. - * The originator should m0_panic itself in case if something needs - * to be canceled. It will be re-started and continue its execution. - */ - oi->oi_in_completion = true; - oi->oi_ar.ar_ast.sa_cb = idx_op_ast_executed; - M0_ASSERT(req->idr_dreq.dr_dtx->tx_dtx->dd_sm.sm_grp == oi->oi_sm_grp); - M0_ASSERT(m0_sm_group_is_locked(oi->oi_sm_grp)); - idx_op_ast_executed(oi->oi_sm_grp, &oi->oi_ar.ar_ast); - M0_LEAVE(); + M0_ENTRY("is_executed=%d, is_stable=%d", !!is_executed, !!is_stable); + return M0_RC(is_executed && is_stable); } static bool dixreq_clink_dtx_cb(struct m0_clink *cl) { - struct dix_req *dix_req = M0_AMB(dix_req, cl, idr_clink); + struct dix_req *dix_req = M0_AMB(dix_req, cl, idr_dtx_clink); struct m0_op_idx *oi = dix_req->idr_oi; - struct m0_sm *req_sm = M0_AMB(req_sm, cl->cl_chan, sm_chan); struct m0_dix_req *dreq = &dix_req->idr_dreq; - struct m0_dtx *dtx = oi->oi_dtx; - enum m0_dtm0_dtx_state state; - int i; - - M0_PRE(M0_IN(oi->oi_oc.oc_op.op_code, (M0_IC_PUT, M0_IC_DEL))); - M0_PRE(dtx != NULL); - - state = m0_dtx0_sm_state(dtx); - if (!M0_IN(state, (M0_DDS_EXECUTED_ALL, M0_DDS_STABLE, M0_DDS_FAILED))) + /* + * Filter out the situations where dtx exists but m0_op_idx + * is not yet attached to dix_req. + */ + if (oi == NULL) return false; - switch (state) { - case M0_DDS_EXECUTED_ALL: - /* TODO: we have a single kv pair; probably, it does not have - * to be a loop. - */ - for (i = 0; i < m0_dix_req_nr(dreq); i++) { - oi->oi_rcs[i] = dreq->dr_items[i].dxi_rc; - } - /* XXX: We cannot use m0_dix_generic_rc here because the - * precondition fails in this case. At this point error - * handling is not covered here, and probably the error - * code needs to be first propogated from DIX to DTX and - * then it needs to be passed here as dtx.dd_sm.sm_rc. 
@@ -1119,9 +1068,10 @@ static void dix_dreq_prepare(struct dix_req *req,
 			     struct m0_op_idx *oi)
 {
 	dix_build(oi, dix);
-	m0_clink_add(oi->oi_dtx != NULL ?
-		     &oi->oi_dtx->tx_dtx->dd_sm.sm_chan :
-		     &req->idr_dreq.dr_sm.sm_chan, &req->idr_clink);
+	m0_clink_add(&req->idr_dreq.dr_sm.sm_chan, &req->idr_clink);
+	if (oi->oi_dtx != NULL)
+		m0_clink_add(&oi->oi_dtx->tx_dtx->dd_sm.sm_chan,
+			     &req->idr_dtx_clink);
 }

 static void dix_put_ast(struct m0_sm_group *grp, struct m0_sm_ast *ast)
@@ -1271,8 +1221,13 @@ M0_INTERNAL int m0__idx_cancel(struct m0_op_idx *oi)
 }

 static void dix_set_idx_flags(struct m0_op_idx *oi)
-{
-	if (ENABLE_DTM0)
+{
+	/* XXX:
+	 * For some reason, "if(ENABLE_DTM0)" was added here, but
+	 * it wasn't properly tested. Disabling it until the code
+	 * is covered with tests.
+	 */
+	if (0)
 		oi->oi_flags |= M0_OIF_SKIP_LAYOUT;

 	if (!(oi->oi_flags & M0_OIF_SKIP_LAYOUT))
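
A note on the motr/idx_dix.c rework above: instead of attaching a single clink to either the DIX request SM or the DTX SM, the request now holds two always-initialised clinks, and completion is posted by whichever callback observes dix_req_is_completed() become true last. The following is a minimal, self-contained sketch of that completion rule in plain C; the names (two_source_req, on_event) are hypothetical and not the Motr API:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Sketch of the predicate in dix_req_is_completed(): the operation
     * completes only when the DIX request reached a terminal state
     * (DIXREQ_FINAL/DIXREQ_FAILURE) AND the optional dtx reached
     * M0_DDS_STABLE/M0_DDS_FAILED.
     */
    struct two_source_req {
            bool has_dtx;      /* a dtx is attached (PUT/DEL with DTM0) */
            bool dix_terminal; /* DIX request SM reached a final state  */
            bool dtx_settled;  /* dtx became STABLE or FAILED           */
    };

    static bool req_is_completed(const struct two_source_req *req)
    {
            bool is_executed = req->dix_terminal;
            bool is_stable   = !req->has_dtx || req->dtx_settled;

            return is_executed && is_stable;
    }

    /* Both callbacks re-check the predicate; the last terminal event
     * is the one that completes the operation. */
    static void on_event(struct two_source_req *req, const char *src)
    {
            if (req_is_completed(req))
                    printf("completed after '%s' event\n", src);
    }

    int main(void)
    {
            struct two_source_req req = { .has_dtx = true };

            req.dix_terminal = true;
            on_event(&req, "dix");  /* no-op: dtx is not settled yet */
            req.dtx_settled = true;
            on_event(&req, "dtx");  /* prints the completion message */
            return 0;
    }
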
" + "file=%s, rc=%d", expected_file, rc); + sleep(1); + } + + m0_console_printf("File detected: '%s'\n", expected_file); + (void) remove(expected_file); + + return 0; +} + static void log_hex_val(const char *tag, void *buf, int size) { int i; @@ -274,6 +295,9 @@ static int cmd_exec(struct index_cmd *cmd) case GENV: rc = genv(cmd->ic_filename, cmd->ic_cnt, cmd->ic_len); break; + case WLF: + rc = wait_file(cmd->ic_filename); + break; default: rc = M0_ERR(-EINVAL); M0_ASSERT(0); diff --git a/motr/m0kv/index.h b/motr/m0kv/index.h index e24d067eb22..f75b66ccc86 100644 --- a/motr/m0kv/index.h +++ b/motr/m0kv/index.h @@ -43,7 +43,8 @@ enum { GET, /* Get record. */ NXT, /* Next record. */ GENF, /* Generate FID-file. */ - GENV /* Generate VAL-file. */ + GENV, /* Generate VAL-file. */ + WLF, /* Wait for a file to appear. */ }; enum { diff --git a/motr/m0kv/index_parser.c b/motr/m0kv/index_parser.c index 8865e614f2a..cc975fad001 100644 --- a/motr/m0kv/index_parser.c +++ b/motr/m0kv/index_parser.c @@ -61,6 +61,7 @@ static const struct command_descr commands[] = { { GENF, "genf", "genf CNT FILE, generate file with several FID" }, { GENV, "genv", "genv CNT SIZE FILE, generate file with several " "KEY_PARAM/VAL_PARAM. Note: SIZE > 16" }, + { WLF, "wait", "wait FILE, await a file to appear" }, }; static int command_id(const char *name) @@ -362,6 +363,11 @@ static int command_assign(struct index_cmd *cmd, int *argc, char ***argv) ++*params; *argc -= 3; break; + case WLF: + cmd->ic_filename = **params; + ++*params; + --*argc; + break; default: M0_IMPOSSIBLE("Wrong command"); } @@ -409,6 +415,9 @@ static bool command_is_valid(struct index_cmd *cmd) cmd->ic_cnt != 0 && cmd->ic_len != 0; break; + case WLF: + rc = cmd->ic_filename != NULL; + break; default: M0_IMPOSSIBLE("Wrong command."); } diff --git a/motr/magic.h b/motr/magic.h index ecc23779ed2..8b4eebc85d2 100644 --- a/motr/magic.h +++ b/motr/magic.h @@ -430,6 +430,10 @@ enum m0_magic_satchel { M0_DTM_CAT_MAGIX = 0x33acce551b1e4277, /* m0_dtm::d_excited::t_magic (flooded baboo) */ M0_DTM_EXC_MAGIX = 0x33f100dedbab0077, + /* recovery_fom::rf_magic (solidified 66) */ + M0_DTM0_RMACH_MAGIC = 0x335011D1F1ED6677, + /* rfom_tl::td_head_magic (seeded cobble) */ + M0_DTM0_RMACH_HEAD_MAGIC = 0x335EEDEDC0BB1E77, /* Failure Domains */ /* m0_fd_perm_cache::fpc_magic (fascia doodia) */ diff --git a/motr/ut/idx_dix.c b/motr/ut/idx_dix.c index 11d89873dac..252bef10863 100644 --- a/motr/ut/idx_dix.c +++ b/motr/ut/idx_dix.c @@ -1118,7 +1118,7 @@ static void dtm0_ut_cas_op_prepare(const struct m0_fid *cfid, } } -static void dtm0_ut_send_redo(const struct m0_fid *ifid, +static void dtm0_ut_send_redo(const struct m0_fid *ifid, uint32_t sdev_id, uint64_t *key, uint64_t *val) { int rc; @@ -1142,8 +1142,6 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid, * Ivan Alekhin. */ struct m0_fom zero_fom_to_be_deleted = {}; - /* Extreme hack to convert index fid to component catalogue fid. 
diff --git a/motr/magic.h b/motr/magic.h
index ecc23779ed2..8b4eebc85d2 100644
--- a/motr/magic.h
+++ b/motr/magic.h
@@ -430,6 +430,10 @@ enum m0_magic_satchel {
 	M0_DTM_CAT_MAGIX = 0x33acce551b1e4277,
 	/* m0_dtm::d_excited::t_magic (flooded baboo) */
 	M0_DTM_EXC_MAGIX = 0x33f100dedbab0077,
+	/* recovery_fom::rf_magic (solidified 66) */
+	M0_DTM0_RMACH_MAGIC = 0x335011D1F1ED6677,
+	/* rfom_tl::td_head_magic (seeded cobble) */
+	M0_DTM0_RMACH_HEAD_MAGIC = 0x335EEDEDC0BB1E77,

 	/* Failure Domains */
 	/* m0_fd_perm_cache::fpc_magic (fascia doodia) */
diff --git a/motr/ut/idx_dix.c b/motr/ut/idx_dix.c
index 11d89873dac..252bef10863 100644
--- a/motr/ut/idx_dix.c
+++ b/motr/ut/idx_dix.c
@@ -1118,7 +1118,7 @@ static void dtm0_ut_cas_op_prepare(const struct m0_fid *cfid,
 	}
 }

-static void dtm0_ut_send_redo(const struct m0_fid *ifid,
+static void dtm0_ut_send_redo(const struct m0_fid *ifid, uint32_t sdev_id,
 			      uint64_t *key, uint64_t *val)
 {
 	int rc;
@@ -1142,8 +1142,6 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid,
 	 * Ivan Alekhin.
 	 */
 	struct m0_fom zero_fom_to_be_deleted = {};
-	/* Extreme hack to convert index fid to component catalogue fid. */
-	uint32_t sdev_idx = 10;

 	m0_dtm0_clk_src_init(&dcs, M0_DTM0_CS_PHYS);
 	m0_dtm0_clk_src_now(&dcs, &now);
@@ -1162,7 +1160,7 @@ static void dtm0_ut_send_redo(const struct m0_fid *ifid,
 		.dti_fid = cli_dtm0_fid
 	};

-	m0_dix_fid_convert_dix2cctg(ifid, &cctg_fid, sdev_idx);
+	m0_dix_fid_convert_dix2cctg(ifid, &cctg_fid, sdev_id);
 	dtm0_ut_cas_op_prepare(&cctg_fid, &cas_op, &cas_rec, key, val,
 			       &txr);
@@ -1216,7 +1214,59 @@ static void dtm0_ut_read_and_check(uint64_t key, uint64_t val)
 	m0_free0(&rcs);
 }

-static void st_dtm0_r(void)
+static uint32_t dtm0_ut_cas_sdev_id_get(void)
+{
+	struct m0_fid           srv_cas_fid;
+	struct m0_fid           srv_proc_fid;
+	struct m0_confc        *confc = m0_reqh2confc(&ut_m0c->m0c_reqh);
+	struct m0_conf_obj     *obj;
+	struct m0_conf_service *service;
+	struct m0_conf_obj     *sdevs_dir;
+	struct m0_conf_sdev    *sdev;
+	bool                    found = false;
+	uint32_t                sdev_id = 0;
+	int                     rc;
+
+	rc = m0_fid_sscanf(ut_m0_config.mc_process_fid, &srv_proc_fid);
+	M0_UT_ASSERT(rc == 0);
+	rc = m0_conf_process2service_get(confc, &srv_proc_fid,
+					 M0_CST_CAS, &srv_cas_fid);
+	M0_UT_ASSERT(rc == 0);
+
+	obj = m0_conf_cache_lookup(&confc->cc_cache, &srv_cas_fid);
+	M0_ASSERT(obj != NULL);
+
+	service = M0_CONF_CAST(obj, m0_conf_service);
+	M0_ASSERT(service != NULL);
+
+	sdevs_dir = &service->cs_sdevs->cd_obj;
+	rc = m0_confc_open_sync(&sdevs_dir, sdevs_dir, M0_FID0);
+	M0_ASSERT(rc == 0);
+
+	obj = NULL;
+	while ((rc = m0_confc_readdir_sync(sdevs_dir, &obj)) > 0) {
+		sdev = M0_CONF_CAST(obj, m0_conf_sdev);
+		if (!found) {
+			sdev_id = sdev->sd_dev_idx;
+			found = true;
+		} else {
+			/*
+			 * A single device attached to the CAS service is
+			 * the standard configuration for now; several
+			 * attached devices are not supported in the UT.
+			 */
+			M0_IMPOSSIBLE("Do not support several CAS devices.");
+		}
+	}
+
+	m0_confc_close(sdevs_dir);

+	M0_ASSERT(found);
+
+	return sdev_id;
+}
+
+static void st_dtm0_r_common(uint32_t sdev_id)
 {
 	m0_time_t rem;
 	uint64_t  key = 111;
@@ -1227,7 +1277,7 @@ static void st_dtm0_r(void)
 	idx_setup();
 	exec_one_by_one(1, M0_IC_PUT);

-	dtm0_ut_send_redo(&duc.duc_ifid, &key, &val);
+	dtm0_ut_send_redo(&duc.duc_ifid, sdev_id, &key, &val);

 	/* XXX dirty hack, but now we don't have completion notification */
 	rem = 2ULL * M0_TIME_ONE_SECOND;
@@ -1238,6 +1288,27 @@
 	idx_teardown();
 }

+static void st_dtm0_r(void)
+{
+	/* CAS sdev id from the configuration. */
+	uint32_t sdev_id = dtm0_ut_cas_sdev_id_get();
+	st_dtm0_r_common(sdev_id);
+}
+
+static void st_dtm0_r_wrong_sdev(void)
+{
+	/*
+	 * A random CAS sdev id; it should be fixed up by the REDO handler.
+	 * In a real system, participants send REDO messages with their
+	 * own CAS device IDs during recovery, so the REDO handler of the
+	 * process being recovered needs to get the CAS device ID attached
+	 * to the local CAS service and set it in the operation's CAS ID.
+	 */
+	uint32_t sdev_id = dtm0_ut_cas_sdev_id_get() + 12345;
+	st_dtm0_r_common(sdev_id);
+}
+
 struct m0_ut_suite ut_suite_mt_idx_dix = {
 	.ts_name = "idx-dix-mt",
 	.ts_owners = "Anatoliy",
@@ -1252,6 +1323,7 @@ struct m0_ut_suite ut_suite_mt_idx_dix = {
 		{ "dtm0_e_then_s",     st_dtm0_e_then_s,     "Ivan"   },
 		{ "dtm0_c",            st_dtm0_c,            "Ivan"   },
 		{ "dtm0_r",            st_dtm0_r,            "Sergey" },
+		{ "dtm0_r_wrong_sdev", st_dtm0_r_wrong_sdev, "Sergey" },
 		{ NULL, NULL }
 	}
 };
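
A note on st_dtm0_r_wrong_sdev above: the test deliberately embeds a bogus device index in the component-catalogue fid and relies on the REDO handler to re-derive the local one. Conceptually, the fix-up is the round-trip sketched below. This is not the actual handler code: local_sdev_id is a hypothetical parameter (the UT obtains the real value via dtm0_ut_cas_sdev_id_get()), and the two converters are assumed to be the ones declared in dix/fid_convert.h:

    /* Sketch: replace the sender's device index in a cctg fid with the
     * device index of the local CAS service. */
    static void redo_fixup_cctg_fid(struct m0_fid *cctg_fid,
                                    uint32_t local_sdev_id)
    {
            struct m0_fid ifid;

            /* Strip the sender's device index, recovering the index fid. */
            m0_dix_fid_convert_cctg2dix(cctg_fid, &ifid);
            /* Re-embed the local device index. */
            m0_dix_fid_convert_dix2cctg(&ifid, cctg_fid, local_sdev_id);
    }
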
+ */ + uint32_t sdev_id = dtm0_ut_cas_sdev_id_get() + 12345; + st_dtm0_r_common(sdev_id); +} + struct m0_ut_suite ut_suite_mt_idx_dix = { .ts_name = "idx-dix-mt", .ts_owners = "Anatoliy", @@ -1252,6 +1323,7 @@ struct m0_ut_suite ut_suite_mt_idx_dix = { { "dtm0_e_then_s", st_dtm0_e_then_s, "Ivan" }, { "dtm0_c", st_dtm0_c, "Ivan" }, { "dtm0_r", st_dtm0_r, "Sergey" }, + { "dtm0_r_wrong_sdev", st_dtm0_r_wrong_sdev, "Sergey" }, { NULL, NULL } } }; diff --git a/rpc/conn.c b/rpc/conn.c index 57b2ddefd56..27aff8dbd79 100644 --- a/rpc/conn.c +++ b/rpc/conn.c @@ -1349,10 +1349,18 @@ static bool rpc_conn__on_service_event_cb(struct m0_clink *clink) M0_PRE(m0_rpc_conn2svc(conn) == obj); M0_LOG(M0_DEBUG, "obj->co_ha_state = %d", obj->co_ha_state); /* + * At this moment, M0_NC_TRANSIENT means that some volatile state + * associated with that process was lost (its memory, packets, + * connections -- anything). + * In this case, the best thing we can do is to cancel + * all the sessions and let the user to re-connect. + * M0_NC_FAILED means the same thing plus some persistent state + * was lost. + * Previous outdated comment: * Ignore M0_NC_TRANSIENT state to keep items re-sending until service * gets M0_NC_ONLINE or becomes M0_NC_FAILED finally. */ - if (obj->co_ha_state == M0_NC_FAILED) + if (M0_IN(obj->co_ha_state, (M0_NC_FAILED, M0_NC_TRANSIENT))) m0_rpc_conn_sessions_cancel(conn); /** * @todo See if to __conn_ha_unsubscribe() right now, but not wait until diff --git a/rpc/rpc_opcodes.h b/rpc/rpc_opcodes.h index 0310fa2123d..cb0ba05b60f 100644 --- a/rpc/rpc_opcodes.h +++ b/rpc/rpc_opcodes.h @@ -358,6 +358,7 @@ enum M0_RPC_OPCODES { M0_ISCSERVICE_EXEC_OPCODE = 1072, M0_DTM0_RLINK_OPCODE = 1073, M0_FDMI_SOURCE_DOCK_TIMER_OPCODE = 1074, + M0_DTM0_RECOVERY_FOM_OPCODE = 1075, M0_OPCODES_NR = 2048 } M0_XCA_ENUM; diff --git a/scripts/gdb/gdb-extensions b/scripts/gdb/gdb-extensions index 40790d4d9e6..c40ef48b9ad 100644 --- a/scripts/gdb/gdb-extensions +++ b/scripts/gdb/gdb-extensions @@ -113,3 +113,37 @@ define runever end end end + +define coro-bt + set $c = ((struct m0_co_context *) ($arg0)) + set $i = 0 + while ($i < $c->mc_frame) + list *$c->mc_stack[$i] + set $i = $i + 1 + end +end + +document coro-bt + Prints the backtrace of a coroutine context. +end + +define rfom-print + set $s = ((struct recovery_fom *) ($arg0)) + printf "p=%p, v=%d, l=%d, ", $s, (int) $s->rf_is_volatile, (int) $s->rf_is_local + printf "eol=%d, ", (int) $s->rf_last_known_eol + printf "ha=%d, end=%d, ", (int) $s->rf_last_known_ha_state, (int) $s->rf_eolq.bq_the_end + printf "coro=%p \n", ($s->rf_coro.mc_stack[$s->rf_coro.mc_frame - 1]) + coro-bt &$s->rf_coro +end + +define drm-print + set $r = (struct m0_dtm0_recovery_machine *) ($arg0) + m0-list-print &($arg0)->rm_rfoms struct recovery_fom rf_linkage rfom-print +end + +document drm-print + Prints parts of DTM0 recovery machine state in human-readable format. + The only argument is a pointer to a recovery machine. + The function prints some properties of recovery FOMs and the + backtraces of their coroutines. 
diff --git a/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py b/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
index 3db5129eba0..696c416b12a 100644
--- a/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
+++ b/scripts/install/opt/seagate/cortx/motr/bin/motr_mini_prov.py
@@ -59,6 +59,7 @@ CMD_RETRY_COUNT = 5
 MEM_THRESHOLD = 4*1024*1024*1024
 CVG_COUNT_KEY = "num_cvg"
+MOTR_M0D_MIN_RPC_RECVQ_LEN = 64


 class MotrError(Exception):
     """ Generic Exception with error code and output """
@@ -494,6 +495,7 @@ def update_copy_motr_config_file(self):
                   ("MOTR_M0D_DATA_DIR", f"{MOTR_M0D_DATA_DIR}"),
                   ("MOTR_M0D_CONF_XC", f"{MOTR_M0D_CONF_XC}"),
                   ("MOTR_M0D_ADDB_STOB_DIR", f"{MOTR_M0D_ADDB_STOB_DIR}"),
+                  ("MOTR_M0D_MIN_RPC_RECVQ_LEN", f"{MOTR_M0D_MIN_RPC_RECVQ_LEN}"),
                   ("MOTR_M0D_TRACE_DIR", f"{MOTR_M0D_TRACE_DIR}")]
     update_config_file(self, f"{MOTR_SYS_CFG}", config_kvs)

     # Copy config file to new path
diff --git a/utils/m0hagen b/utils/m0hagen
index 08210b4466e..966e370add9 100755
--- a/utils/m0hagen
+++ b/utils/m0hagen
@@ -295,7 +295,7 @@ yaml.add_implicit_resolver(Fid.yaml_tag, Fid.re_conf,

 # enum m0_ha_obj_state
 ha_states = ['unknown', 'online', 'failed', 'transient', 'repair', 'repaired',
-             'rebalance']
+             'rebalance', 'dtm-recovering']


 class HAMsgNVec(XCodeable):