diff --git a/.xperior/testds/motr-single_tests.yaml b/.xperior/testds/motr-single_tests.yaml index cda557d09c0..b8c5d57ad76 100644 --- a/.xperior/testds/motr-single_tests.yaml +++ b/.xperior/testds/motr-single_tests.yaml @@ -19,8 +19,1268 @@ --- Tests: - - id : 00userspace-tests - script : 'm0 run-ut' + - id : 00userspace-tests_libm0-ut + script : 'm0 run-ut -t libm0-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_addb2-base + script : 'm0 run-ut -t addb2-base' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_addb2-consumer + script : 'm0 run-ut -t addb2-consumer' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_addb2-histogram + script : 'm0 run-ut -t addb2-histogram' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_addb2-net + script : 'm0 run-ut -t addb2-net' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_addb2-storage + script : 'm0 run-ut -t addb2-storage' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_addb2-sys + script : 'm0 run-ut -t addb2-sys' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_di-ut + script : 'm0 run-ut -t di-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_balloc-ut + script : 'm0 run-ut -t balloc-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_be-ut + script : 'm0 run-ut -t be-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_buffer_pool_ut + script : 'm0 run-ut -t buffer_pool_ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_bulk-client-ut + script : 'm0 run-ut -t bulk-client-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 
00userspace-tests_bulk-server-ut + script : 'm0 run-ut -t bulk-server-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_bytecount-ut + script : 'm0 run-ut -t bytecount-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dtm0-ut + script : 'm0 run-ut -t dtm0-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_btree-ut + script : 'm0 run-ut -t btree-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_capa-ut + script : 'm0 run-ut -t capa-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_cas-client + script : 'm0 run-ut -t cas-client' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_cas-service + script : 'm0 run-ut -t cas-service' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_client-ut + script : 'm0 run-ut -t client-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_obj-ut + script : 'm0 run-ut -t obj-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_io-ut + script : 'm0 run-ut -t io-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_io-nw-xfer-ut + script : 'm0 run-ut -t io-nw-xfer-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_io-pargrp-ut + script : 'm0 run-ut -t io-pargrp-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_io-req-ut + script : 'm0 run-ut -t io-req-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_io-req-fop-ut + script : 'm0 run-ut -t io-req-fop-ut' + dir : 
src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_sync-ut + script : 'm0 run-ut -t sync-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_idx-ut + script : 'm0 run-ut -t idx-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_idx-dix + script : 'm0 run-ut -t idx-dix' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_idx-dix-mt + script : 'm0 run-ut -t idx-dix-mt' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_layout-ut + script : 'm0 run-ut -t layout-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_helpers-ufid-ut + script : 'm0 run-ut -t helpers-ufid-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_cm-cp-ut + script : 'm0 run-ut -t cm-cp-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_cm-ut + script : 'm0 run-ut -t cm-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_cob-ut + script : 'm0 run-ut -t cob-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_cob-foms-ut + script : 'm0 run-ut -t cob-foms-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-ut + script : 'm0 run-ut -t conf-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-load-ut + script : 'm0 run-ut -t conf-load-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-pvers-ut + script : 'm0 run-ut -t conf-pvers-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + 
sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_confc-ut + script : 'm0 run-ut -t confc-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-glob-ut + script : 'm0 run-ut -t conf-glob-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-diter-ut + script : 'm0 run-ut -t conf-diter-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_confstr-ut + script : 'm0 run-ut -t confstr-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-validation-ut + script : 'm0 run-ut -t conf-validation-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_conf-walk-ut + script : 'm0 run-ut -t conf-walk-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rconfc-ut + script : 'm0 run-ut -t rconfc-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-connection-ut + script : 'm0 run-ut -t rpc-connection-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dix-client-ut + script : 'm0 run-ut -t dix-client-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dix-cm-iter + script : 'm0 run-ut -t dix-cm-iter' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dtm-nucleus-ut + script : 'm0 run-ut -t dtm-nucleus-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dtm-transmit-ut + script : 'm0 run-ut -t dtm-transmit-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dtm-dtx-ut + script : 'm0 run-ut -t dtm-dtx-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + 
sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dtm0-clk-src-ut + script : 'm0 run-ut -t dtm0-clk-src-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_dtm0-log-ut + script : 'm0 run-ut -t dtm0-log-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_failure_domains_tree-ut + script : 'm0 run-ut -t failure_domains_tree-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_failure_domains-ut + script : 'm0 run-ut -t failure_domains-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fis-ut + script : 'm0 run-ut -t fis-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fdmi-filterc-ut + script : 'm0 run-ut -t fdmi-filterc-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fdmi-pd-ut + script : 'm0 run-ut -t fdmi-pd-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fdmi-sd-ut + script : 'm0 run-ut -t fdmi-sd-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fdmi-fol-ut + script : 'm0 run-ut -t fdmi-fol-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fdmi-fol-fini-ut + script : 'm0 run-ut -t fdmi-fol-fini-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fdmi-filter-eval-ut + script : 'm0 run-ut -t fdmi-filter-eval-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fit-ut + script : 'm0 run-ut -t fit-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fol-ut + script : 'm0 run-ut -t fol-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip 
+ sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fom-timedwait-ut + script : 'm0 run-ut -t fom-timedwait-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-formation-ut + script : 'm0 run-ut -t rpc-formation-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_ha-ut + script : 'm0 run-ut -t ha-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_ha-state-ut + script : 'm0 run-ut -t ha-state-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_ios-bufferpool-ut + script : 'm0 run-ut -t ios-bufferpool-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_isc-api-ut + script : 'm0 run-ut -t isc-api-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_isc-service-ut + script : 'm0 run-ut -t isc-service-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-item-ut + script : 'm0 run-ut -t rpc-item-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-item-source-ut + script : 'm0 run-ut -t rpc-item-source-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_layout-ut + script : 'm0 run-ut -t layout-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_layout-access-plan-ut + script : 'm0 run-ut -t layout-access-plan-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-link-ut + script : 'm0 run-ut -t rpc-link-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fop-lock-ut + script : 'm0 run-ut -t fop-lock-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : 
Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_fom-stats-ut + script : 'm0 run-ut -t fom-stats-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-bulk-if + script : 'm0 run-ut -t net-bulk-if' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-bulk-mem + script : 'm0 run-ut -t net-bulk-mem' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-lnet + script : 'm0 run-ut -t net-lnet' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_libfab-ut + script : 'm0 run-ut -t libfab-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-misc-ut + script : 'm0 run-ut -t net-misc-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-module + script : 'm0 run-ut -t net-module' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-test + script : 'm0 run-ut -t net-test' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_net-prov-ut + script : 'm0 run-ut -t net-prov-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_m0d-ut + script : 'm0 run-ut -t m0d-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_mdservice-ut + script : 'm0 run-ut -t mdservice-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_module-ut + script : 'm0 run-ut -t module-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_ms-fom-ut + script : 'm0 run-ut -t ms-fom-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 
01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-packet-encdec-ut + script : 'm0 run-ut -t rpc-packet-encdec-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_parity_math-ut + script : 'm0 run-ut -t parity_math-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_parity_math_ssse3-ut + script : 'm0 run-ut -t parity_math_ssse3-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_poolmach-ut + script : 'm0 run-ut -t poolmach-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_reqh-ut + script : 'm0 run-ut -t reqh-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_reqh-fop-allow-ut + script : 'm0 run-ut -t reqh-fop-allow-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_reqh-service-ut + script : 'm0 run-ut -t reqh-service-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_reqh-service-ctx-ut + script : 'm0 run-ut -t reqh-service-ctx-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rm-ut + script : 'm0 run-ut -t rm-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rm-rcredits-ut + script : 'm0 run-ut -t rm-rcredits-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rm-rwlock-ut + script : 'm0 run-ut -t rm-rwlock-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-at + script : 'm0 run-ut -t rpc-at' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-machine-ut + script : 'm0 run-ut -t rpc-machine-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : 
/var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-rcv-session-ut + script : 'm0 run-ut -t rpc-rcv-session-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-lib-ut + script : 'm0 run-ut -t rpc-lib-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-conn-pool-ut + script : 'm0 run-ut -t rpc-conn-pool-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_rpc-session-ut + script : 'm0 run-ut -t rpc-session-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_sm-ut + script : 'm0 run-ut -t sm-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_snscm_xform-ut + script : 'm0 run-ut -t snscm_xform-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_snscm_storage-ut + script : 'm0 run-ut -t snscm_storage-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_sns-cm-repair-ut + script : 'm0 run-ut -t sns-cm-repair-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_snscm_net-ut + script : 'm0 run-ut -t snscm_net-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_sns-file-lock-ut + script : 'm0 run-ut -t sns-file-lock-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_spiel-ut + script : 'm0 run-ut -t spiel-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_spiel-ci-ut + script : 'm0 run-ut -t spiel-ci-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_sss-ut + script : 'm0 run-ut -t sss-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : 
/var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_stats-ut + script : 'm0 run-ut -t stats-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_spiel-conf-ut + script : 'm0 run-ut -t spiel-conf-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_stob-ut + script : 'm0 run-ut -t stob-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_storage-dev-ut + script : 'm0 run-ut -t storage-dev-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_udb-ut + script : 'm0 run-ut -t udb-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_xcode_bufvec_fop-ut + script : 'm0 run-ut -t xcode_bufvec_fop-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_ff2c-ut + script : 'm0 run-ut -t ff2c-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_xcode-ut + script : 'm0 run-ut -t xcode-ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_pi_ut + script : 'm0 run-ut -t pi_ut' + dir : src/scripts + executor : Xperior::Executor::MotrTest + #executor : Xperior::Executor::Skip + sandbox : /var/motr/m0ut + groupname: 01motr-single-node + polltime : 15 + timeout : 2400 + + - id : 00userspace-tests_libconsole-ut + script : 'm0 run-ut -t libconsole-ut' dir : src/scripts executor : Xperior::Executor::MotrTest #executor : Xperior::Executor::Skip diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 05c4eb4f6ee..ce508c10fba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -175,5 +175,5 @@ Refer to the [Motr Coding Style Guide](../dev/doc/coding-style.md) and the [CORT You can reach out to us with your questions, feedback, and comments through our CORTX Communication Channels: -- Join our CORTX-Open Source Slack Channel to interact with your fellow community members and gets your questions answered. [![Slack Channel](https://img.shields.io/badge/chat-on%20Slack-blue)](https://join.slack.com/t/cortxcommunity/shared_invite/zt-femhm3zm-yiCs5V9NBxh89a_709FFXQ?) +- Join our CORTX-Open Source Slack Channel to interact with community members and get your questions answered. 
[![Slack Channel](https://img.shields.io/badge/chat-on%20Slack-blue)](https://join.slack.com/t/cortxcommunity/shared_invite/zt-femhm3zm-yiCs5V9NBxh89a_709FFXQ?) - If you'd like to contact us directly, drop us a mail at cortx-questions@seagate.com. diff --git a/SUPPORT.md b/SUPPORT.md index 579cc597c4b..ade5842bfad 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -3,6 +3,6 @@ Looking for support for CORTX parent or a repository ? Consider some of these resources: - Join our CORTX-Open Source Slack channel [![Slack](https://img.shields.io/badge/chat-on%20Slack-blue")](https://join.slack.com/t/cortxcommunity/shared_invite/zt-femhm3zm-yiCs5V9NBxh89a_709FFXQ?) to interact with community members and gets your questions answered. -- Join [GitHub Discussions](https://github.com/Seagate/cortx-motr/discussions) to ask, answer, and discuss topics with your fellow CORTX contributors! +- Join [GitHub Discussions](https://github.com/Seagate/cortx-motr/discussions) to ask, answer, and discuss topics with CORTX contributors! - If you'd like to contact us directly, drop us a mail at [cortx-questions@seagate.com](mailto:cortx-questions@seagate.com) . - We like to highlight the work and contributions of our community members—if you have solved an interesting challenge, or you are interested in sharing your experience or use cases, we want to talk to you! Please email our Community Manager [rachel.novak@seagate.com](mailto:rachel.novak@seagate.com) or [schedule a meeting with us](https://outlook.office365.com/owa/calendar/CORTXCommunity@seagate.com/bookings/s/x8yMn2ODxUCOdhxvXkH4FA2) to share. diff --git a/be/btree.c b/be/btree.c index ec31a8d3bb9..7581f1701f4 100644 --- a/be/btree.c +++ b/be/btree.c @@ -1893,10 +1893,10 @@ M0_INTERNAL void m0_btree_lrulist_set_lru_config(int64_t slow_lru_mem_release, M0_ASSERT(lru_space_wm_high >= lru_space_wm_target && lru_space_wm_target >= lru_space_wm_low); - M0_LOG(M0_INFO, "Btree LRU List Watermarks: Low - %"PRIi64" Mid - " + M0_LOG(M0_NOTICE, "Btree LRU List Watermarks: Low - %"PRIi64" Mid - " "%"PRIi64" High - %"PRIi64" \n", lru_space_wm_low, lru_space_wm_target, lru_space_wm_high); - M0_LOG(M0_INFO, "Btree LRU List trickle release: %s \n", + M0_LOG(M0_NOTICE, "Btree LRU List trickle release: %s \n", lru_trickle_release_en ? "true" : "false"); } @@ -2025,7 +2025,6 @@ static int64_t tree_get(struct node_op *op, struct segaddr *addr, int nxt) return nxt; } - /** * Returns the tree to the free tree pool if the reference count for this tree * reaches zero. @@ -8780,7 +8779,7 @@ M0_INTERNAL int64_t m0_btree_lrulist_purge_check(enum m0_btree_purge_user user, if (lru_space_used < lru_space_wm_low) { /** Do nothing. */ if (user == M0_PU_EXTERNAL) - M0_LOG(M0_INFO, "Skipping memory release since used " + M0_LOG(M0_ALWAYS, "Skipping memory release since used " "space is below threshold requested size=%"PRId64 " used space=%"PRId64, size, lru_space_used); lru_trickle_release_mode = false; @@ -8806,7 +8805,7 @@ M0_INTERNAL int64_t m0_btree_lrulist_purge_check(enum m0_btree_purge_user user, purged_size = m0_btree_lrulist_purge(size_to_purge, size_to_purge != 0 ? 
0 : M0_BTREE_TRICKLE_NUM_NODES); - M0_LOG(M0_INFO, " Below critical External user Purge," + M0_LOG(M0_ALWAYS, " Below critical External user Purge," " requested size=%"PRId64" used space=%"PRId64 " purged size=%"PRId64, size, lru_space_used, purged_size); @@ -8827,7 +8826,7 @@ M0_INTERNAL int64_t m0_btree_lrulist_purge_check(enum m0_btree_purge_user user, purged_size = m0_btree_lrulist_purge(size_to_purge, (lru_trickle_release_mode && size_to_purge == 0) ? M0_BTREE_TRICKLE_NUM_NODES : 0); - M0_LOG(M0_INFO, " Above critical purge, User=%s requested size=" + M0_LOG(M0_ALWAYS, " Above critical purge, User=%s requested size=" "%"PRId64" used space=%"PRIu64" purged size=" "%"PRIu64, user == M0_PU_BTREE ? "btree" : "external", size, lru_space_used, purged_size); diff --git a/cas/ctg_store.c b/cas/ctg_store.c index 0b0b56642d3..4f1c3fb7e80 100644 --- a/cas/ctg_store.c +++ b/cas/ctg_store.c @@ -169,6 +169,10 @@ static int ctg_op_exec (struct m0_ctg_op *ctg_op, int next_phase); static int ctg_meta_exec (struct m0_ctg_op *ctg_op, const struct m0_fid *fid, int next_phase); +static int ctg_dead_exec (struct m0_ctg_op *ctg_op, + struct m0_cas_ctg *ctg, + const struct m0_buf *key, + int next_phase); static int ctg_exec (struct m0_ctg_op *ctg_op, struct m0_cas_ctg *ctg, const struct m0_buf *key, @@ -479,7 +483,7 @@ int m0_ctg_create(struct m0_be_seg *seg, struct m0_be_tx *tx, bt.vsize = sizeof(struct meta_value); break; case CTT_DEADIDX: - bt.ksize = sizeof(struct meta_value); + bt.ksize = sizeof(struct generic_key *) + sizeof(void *); bt.vsize = sizeof(void *); break; case CTT_CTIDX: @@ -966,6 +970,11 @@ static void ctg_store_release(struct m0_ref *ref) M0_ENTRY(); m0_mutex_fini(&ctg_store->cs_state_mutex); + /* TODO: Clean up the in-memory tree allocation of every index upon + any CAS operation on that index. */ + ctg_fini(ctg_store->cs_state->cs_meta); + ctg_fini(ctg_store->cs_ctidx); + ctg_fini(ctg_store->cs_dead_index); ctg_store->cs_state = NULL; ctg_store->cs_ctidx = NULL; m0_long_lock_fini(&ctg_store->cs_del_lock); @@ -1382,7 +1391,7 @@ static int ctg_op_exec_normal(struct m0_ctg_op *ctg_op, int next_phase) * m0_be_btree_insert_inplace() have 0 there. 
*/ - vsize = sizeof(struct generic_value); + vsize = sizeof(void *); rec.r_key.k_data = M0_BUFVEC_INIT_BUF(&k_ptr, &ksize); rec.r_val = M0_BUFVEC_INIT_BUF(&v_ptr, &vsize); rec.r_crc_type = M0_BCT_NO_CRC; @@ -1425,6 +1434,8 @@ static int ctg_op_exec_normal(struct m0_ctg_op *ctg_op, int next_phase) &kv_op, tx)); m0_be_op_done(beop); break; + case CTG_OP_COMBINE(CO_DEL, CT_DEAD_INDEX): + ksize = sizeof(struct generic_key *) + sizeof(void *); /* fall through */ case CTG_OP_COMBINE(CO_DEL, CT_BTREE): case CTG_OP_COMBINE(CO_DEL, CT_META): m0_be_op_active(beop); @@ -1649,13 +1660,49 @@ M0_INTERNAL int m0_ctg_dead_index_insert(struct m0_ctg_op *ctg_op, struct m0_cas_ctg *ctg, int next_phase) { - ctg_op->co_ctg = m0_ctg_dead_index(); - ctg_op->co_ct = CT_DEAD_INDEX; ctg_op->co_opcode = CO_PUT; /* Dead index value is empty */ ctg_op->co_val = M0_BUF_INIT0; /* Dead index key is a pointer to a catalogue */ - return ctg_exec(ctg_op, ctg, &M0_BUF_INIT_PTR(&ctg), next_phase); + return ctg_dead_exec(ctg_op, ctg, &M0_BUF_INIT_PTR(&ctg), next_phase); +} + +M0_INTERNAL int m0_ctg_dead_delete(struct m0_ctg_op *ctg_op, + struct m0_cas_ctg *ctg, + const struct m0_buf *key, + int next_phase) +{ + M0_PRE(ctg_op != NULL); + M0_PRE(ctg != NULL); + M0_PRE(key != NULL); + M0_PRE(ctg_op->co_beop.bo_sm.sm_state == M0_BOS_INIT); + + ctg_op->co_opcode = CO_DEL; + + return ctg_dead_exec(ctg_op, ctg, key, next_phase); +} + +static int ctg_dead_exec(struct m0_ctg_op *ctg_op, + struct m0_cas_ctg *ctg, + const struct m0_buf *key, + int next_phase) +{ + int ret = M0_FSO_AGAIN; + + ctg_op->co_ctg = m0_ctg_dead_index(); + ctg_op->co_ct = CT_DEAD_INDEX; + + if (!M0_IN(ctg_op->co_opcode, (CO_MIN, CO_TRUNC, CO_DROP)) && + (ctg_op->co_opcode != CO_CUR || + ctg_op->co_cur_phase != CPH_NEXT)) + ctg_op->co_rc = ctg_kbuf_get(&ctg_op->co_key, key, true); + + if (ctg_op->co_rc != 0) + m0_fom_phase_set(ctg_op->co_fom, next_phase); + else + ret = ctg_op_exec(ctg_op, next_phase); + + return ret; } static int ctg_exec(struct m0_ctg_op *ctg_op, @@ -1666,7 +1713,7 @@ static int ctg_exec(struct m0_ctg_op *ctg_op, int ret = M0_FSO_AGAIN; ctg_op->co_ctg = ctg; - ctg_op->co_ct = CT_BTREE; + ctg_op->co_ct = CT_BTREE; if (!M0_IN(ctg_op->co_opcode, (CO_MIN, CO_TRUNC, CO_DROP)) && (ctg_op->co_opcode != CO_CUR || diff --git a/cas/ctg_store.h b/cas/ctg_store.h index ab5bb859c04..3b4a80c47ea 100644 --- a/cas/ctg_store.h +++ b/cas/ctg_store.h @@ -390,6 +390,19 @@ M0_INTERNAL int m0_ctg_dead_index_insert(struct m0_ctg_op *ctg_op, struct m0_cas_ctg *ctg, int next_phase); +/** + * Deletes 'ctg' from "dead index" catalogue. + * + * @param ctg_op Catalogue operation context. + * @param ctg Catalogue to be deleted from "dead index" catalogue. + * @param key Key (a pointer to the catalogue) to be deleted. + * @param next_phase Next phase of caller FOM. + */ +M0_INTERNAL int m0_ctg_dead_delete(struct m0_ctg_op *ctg_op, + struct m0_cas_ctg *ctg, + const struct m0_buf *key, + int next_phase); + /** * Looks up a catalogue in meta catalogue. * diff --git a/cas/index_gc.c b/cas/index_gc.c index 2ed530ba046..2d1df6c05c9 100644 --- a/cas/index_gc.c +++ b/cas/index_gc.c @@ -361,6 +361,12 @@ static int cgc_fom_tick(struct m0_fom *fom0) fom->cg_ctg_op_initialized = true; result = m0_ctg_drop(ctg_op, fom->cg_ctg, CGC_LOCK_DEAD_INDEX); + /* + * Free the memory allocated for the root node after + * destroying the tree. 
+ */ + if (result == M0_FSO_AGAIN) + m0_free0(&fom->cg_ctg->cc_tree); } else { M0_LOG(M0_DEBUG, "out of credits, commit & restart"); m0_long_unlock(m0_ctg_lock(m0_ctg_dead_index()), @@ -387,14 +393,13 @@ static int cgc_fom_tick(struct m0_fom *fom0) m0_ctg_op_fini(ctg_op); m0_ctg_op_init(ctg_op, fom0, 0); fom->cg_ctg_op_initialized = true; - fom->cg_ctg_key = M0_BUF_INIT(M0_CAS_CTG_KEY_HDR_SIZE, - &fom->cg_ctg); + fom->cg_ctg_key = M0_BUF_INIT_PTR(&fom->cg_ctg); /* * Now completely forget this ctg by deleting its descriptor * from "dead index" catalogue. */ - result = m0_ctg_delete(ctg_op, m0_ctg_dead_index(), - &fom->cg_ctg_key, CGC_SUCCESS); + result = m0_ctg_dead_delete(ctg_op, m0_ctg_dead_index(), + &fom->cg_ctg_key, CGC_SUCCESS); break; case CGC_SUCCESS: m0_long_unlock(m0_ctg_lock(m0_ctg_dead_index()), diff --git a/cas/service.c b/cas/service.c index 6558af0ef4d..5489a4edaf1 100644 --- a/cas/service.c +++ b/cas/service.c @@ -1151,6 +1151,22 @@ static int cas_dtm0_logrec_add(struct m0_fom *fom0, int i; int rc; + /* + * It is impossible to commit a transaction without the DTM0 service up + * and running. + */ + if (dtms == NULL) { + static uint32_t count = 0; + if (count == 0) { + M0_LOG(M0_FATAL, "DTM is enabled but not " "configured in conf; skipping " "DTM for now. Please check!"); + count++; /* Print the message only once. */ + } + return 0; /* FIXME: for now, skip it if there is no DTM service. */ + } + M0_ASSERT(dtms != NULL); + for (i = 0; i < msg->dtd_ps.dtp_nr; ++i) { if (m0_fid_eq(&msg->dtd_ps.dtp_pa[i].p_fid, &dtms->dos_generic.rs_service_fid)) { diff --git a/cas/ut/client_ut.c b/cas/ut/client_ut.c index 3194c4c30dd..235650c3773 100644 --- a/cas/ut/client_ut.c +++ b/cas/ut/client_ut.c @@ -89,6 +89,8 @@ static char *cas_startup_cmd[] = { "-w", "10", "-F", "-f", M0_UT_CONF_PROCESS, "-c", M0_SRC_PATH("cas/ut/conf.xc") + /* FIXME If DTM is enabled, the above conf.xc must be updated to include + * DTM0 services. */ }; static const char *cdbnames[] = { "cas1" }; diff --git a/cas/ut/service_ut.c b/cas/ut/service_ut.c index 66c08723595..2627a4154f9 100644 --- a/cas/ut/service_ut.c +++ b/cas/ut/service_ut.c @@ -491,8 +491,10 @@ static void meta_fop_submit(struct m0_fop_type *fopt, fop_submit(fopt, &m0_cas_meta_fid, recs); - for (i = 0; i < meta_recs_num; i++) + for (i = 0; i < meta_recs_num; i++) { m0_rpc_at_fini(&recs[i].cr_key); + m0_free(recs[i].cr_key.u.ab_buf.b_addr); + } m0_free(recs); } @@ -592,6 +594,9 @@ static void create(void) init(); meta_fid_submit(&cas_put_fopt, &ifid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -624,6 +629,11 @@ static void cctg_create(void) meta_cid_submit(&cas_put_fopt, &cid2); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); m0_dix_ldesc_fini(desc); + /* Cleaning up allocated memory to avoid leaks. */ + meta_cid_submit(&cas_del_fopt, &cid1); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + meta_cid_submit(&cas_del_fopt, &cid2); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -652,6 +662,9 @@ static void cctg_create_lookup(void) meta_cid_submit(&cas_get_fopt, &cid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); m0_dix_ldesc_fini(desc); + /* Cleaning up allocated memory to avoid leaks. 
*/ + meta_cid_submit(&cas_del_fopt, &cid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -705,6 +718,9 @@ static void create_lookup(void) M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); meta_fid_submit(&cas_get_fopt, &ifid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -720,6 +736,9 @@ static void create_create(void) M0_UT_ASSERT(rep_check(0, -EEXIST, BUNSET, BUNSET)); meta_fid_submit(&cas_get_fopt, &ifid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -754,6 +773,9 @@ static void recreate(void) M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); meta_fid_submit(&cas_get_fopt, &ifid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -775,6 +797,9 @@ static void meta_cur_1(void) M0_UT_ASSERT(rep.cgr_rep.cr_nr == 1); M0_UT_ASSERT(rep_check(0, 1, BSET, BUNSET)); M0_UT_ASSERT(m0_fid_eq(repv[0].cr_key.u.ab_buf.b_addr, &ifid)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -797,6 +822,9 @@ static void meta_cur_eot(void) M0_UT_ASSERT(rep_check(0, 1, BSET, BUNSET)); M0_UT_ASSERT(rep_check(1, -ENOENT, BUNSET, BUNSET)); M0_UT_ASSERT(m0_fid_eq(repv[0].cr_key.u.ab_buf.b_addr, &ifid)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -816,6 +844,9 @@ static void meta_cur_0(void) 1); M0_UT_ASSERT(rep.cgr_rc == 0); M0_UT_ASSERT(rep.cgr_rep.cr_nr == 0); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -857,6 +888,9 @@ static void meta_cur_none(void) M0_UT_ASSERT(rep_check(1, 0, BUNSET, BUNSET)); M0_UT_ASSERT(rep_check(2, 0, BUNSET, BUNSET)); M0_UT_ASSERT(rep_check(3, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -894,6 +928,9 @@ static void meta_cur_all(void) M0_UT_ASSERT(m0_fid_eq(repv[2].cr_key.u.ab_buf.b_addr, &m0_cas_dead_index_fid)); M0_UT_ASSERT(m0_fid_eq(repv[3].cr_key.u.ab_buf.b_addr, &fid)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &fid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1020,6 +1057,9 @@ static void insert(void) M0_UT_ASSERT(rep.cgr_rc == 0); M0_UT_ASSERT(rep.cgr_rep.cr_nr == 1); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1043,6 +1083,9 @@ static void insert_lookup(void) == sizeof (uint64_t)); M0_UT_ASSERT(2 == *(uint64_t *)rep.cgr_rep.cr_rec[0].cr_val.u.ab_buf.b_addr); + /* Cleaning up allocated memory to avoid leaks. 
*/ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1061,6 +1104,9 @@ static void insert_delete(void) M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); index_op(&cas_get_fopt, &ifid, 1, NOVAL); M0_UT_ASSERT(rep_check(0, -ENOENT, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1075,6 +1121,10 @@ static void lookup_none(void) index_op(&cas_put_fopt, &ifid, 1, 2); index_op(&cas_get_fopt, &ifid, 3, NOVAL); M0_UT_ASSERT(rep_check(0, -ENOENT, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + fini(); } @@ -1101,6 +1151,9 @@ static void empty_value(void) M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); index_op(&cas_get_fopt, &ifid, 1, NOVAL); M0_UT_ASSERT(rep_check(0, -ENOENT, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1118,6 +1171,10 @@ static void insert_2(void) M0_UT_ASSERT(rep_check(0, -EEXIST, BUNSET, BUNSET)); index_op(&cas_get_fopt, &ifid, 1, NOVAL); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + fini(); } @@ -1131,6 +1188,9 @@ static void delete_2(void) M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); index_op(&cas_del_fopt, &ifid, 1, NOVAL); M0_UT_ASSERT(rep_check(0, -ENOENT, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1182,6 +1242,9 @@ static void lookup_N(void) meta_fid_submit(&cas_put_fopt, &ifid); insert_odd(&ifid); lookup_all(&ifid); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1213,6 +1276,9 @@ static void lookup_restart(void) m0_cas__ut_svc_be_set(cas, &be.but_dom); m0_reqh_service_start(cas); lookup_all(&ifid); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1268,6 +1334,9 @@ static void cur_N(void) M0_UT_ASSERT(rep_check(k, -ENOENT, BUNSET, BUNSET)); M0_UT_ASSERT(rep.cgr_rep.cr_nr == INSERTS); } + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1340,6 +1409,9 @@ static void meta_lookup_fail(void) /* Lookup without ENOMEM returns record. */ meta_fid_submit(&cas_get_fopt, &ifid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1355,6 +1427,9 @@ static void meta_delete_fail(void) /* Lookup should return record. */ meta_fid_submit(&cas_get_fopt, &ifid); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. 
*/ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1379,6 +1454,9 @@ static void insert_fail(void) M0_UT_ASSERT(rep.cgr_rep.cr_nr == 1); M0_UT_ASSERT(rep.cgr_rep.cr_rec[0].cr_rc == -ENOENT); M0_UT_ASSERT(rep.cgr_rep.cr_rec[0].cr_val.u.ab_buf.b_addr == NULL); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1409,6 +1487,9 @@ static void lookup_fail(void) M0_UT_ASSERT(rep.cgr_rc == 0); M0_UT_ASSERT(repv[0].cr_val.u.ab_buf.b_nob == sizeof (uint64_t)); M0_UT_ASSERT(*(uint64_t *)repv[0].cr_val.u.ab_buf.b_addr == 2); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1450,6 +1531,9 @@ static void delete_fail(void) M0_UT_ASSERT(rep.cgr_rep.cr_nr == 1); M0_UT_ASSERT(rep.cgr_rep.cr_rec[0].cr_rc == -ENOENT); M0_UT_ASSERT(rep.cgr_rep.cr_rec[0].cr_val.u.ab_buf.b_addr == NULL); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1523,6 +1607,9 @@ static void cur_fail(void) for (i = 2; i < MULTI_INS - 1; i++) M0_UT_ASSERT(repv[i].cr_rc == 0); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1590,6 +1677,9 @@ static void multi_insert(void) M0_UT_ASSERT(rep.cgr_rep.cr_nr == MULTI_INS - 1); M0_UT_ASSERT(m0_forall(i, MULTI_INS - 1, rep.cgr_rep.cr_rec[i].cr_rc == 0)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1619,6 +1709,9 @@ static void multi_lookup(void) rep.cgr_rep.cr_rec[i].cr_rc == 0)); M0_UT_ASSERT(m0_forall(i, MULTI_INS - 1, *(uint64_t *)repv[i].cr_val.u.ab_buf.b_addr == i * i)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1654,6 +1747,9 @@ static void multi_delete(void) rep.cgr_rep.cr_rec[i].cr_rc == -ENOENT)); M0_UT_ASSERT(m0_forall(i, MULTI_INS - 1, repv[i].cr_val.u.ab_buf.b_addr == NULL)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1679,6 +1775,9 @@ static void multi_insert_fail(void) i % 2 ? rep.cgr_rep.cr_rec[i].cr_rc == 0 : rep.cgr_rep.cr_rec[i].cr_rc == -ENOMEM)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1714,6 +1813,9 @@ static void multi_lookup_fail(void) i % 2 ? *(uint64_t *)repv[i].cr_val.u.ab_buf.b_addr == i*i : repv[i].cr_val.u.ab_buf.b_addr == NULL)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1757,6 +1859,9 @@ static void multi_delete_fail(void) i % 2 ? repv[i].cr_val.u.ab_buf.b_addr == NULL : *(uint64_t *)repv[i].cr_val.u.ab_buf.b_addr == i * i)); + /* Cleaning up allocated memory to avoid leaks. 
*/ + meta_fid_submit(&cas_del_fopt, &ifid); + M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); fini(); } @@ -1881,6 +1986,14 @@ static void multi_create_drop(void) 2); M0_UT_ASSERT(rep_check(0, 0, BUNSET, BUNSET)); M0_UT_ASSERT(rep_check(1, 0, BUNSET, BUNSET)); + /* Cleaning up allocated memory to avoid leaks. */ + meta_fop_submit(&cas_del_fopt, + (struct meta_rec[]) { + { .cid = nonce0 }, + { .cid = nonce1 } }, + 2); + M0_UT_ASSERT(rep.cgr_rc == 0); + M0_UT_ASSERT(rep.cgr_rep.cr_nr == 2); fini(); } diff --git a/dix/req.c b/dix/req.c index f5cb4b113fd..9ef7ba7df77 100644 --- a/dix/req.c +++ b/dix/req.c @@ -46,6 +46,7 @@ #include "dix/fid_convert.h" #include "dix/dix_addb.h" #include "dtm0/dtx.h" /* m0_dtx0_* API */ +#include "motr/idx.h" /* M0_DIX_MIN_REPLICA_QUORUM */ static struct m0_sm_state_descr dix_req_states[] = { [DIXREQ_INIT] = { @@ -209,11 +210,15 @@ M0_INTERNAL int m0_dix_req_wait(struct m0_dix_req *req, uint64_t states, static void dix_req_init(struct m0_dix_req *req, struct m0_dix_cli *cli, struct m0_sm_group *grp, + int64_t min_success, bool meta) { + M0_PRE(ergo(min_success < 1, + min_success == M0_DIX_MIN_REPLICA_QUORUM)); M0_SET0(req); req->dr_cli = cli; req->dr_is_meta = meta; + req->dr_min_success = min_success; m0_sm_init(&req->dr_sm, &dix_req_sm_conf, DIXREQ_INIT, grp); m0_sm_addb2_counter_init(&req->dr_sm); } @@ -222,14 +227,15 @@ M0_INTERNAL void m0_dix_mreq_init(struct m0_dix_req *req, struct m0_dix_cli *cli, struct m0_sm_group *grp) { - dix_req_init(req, cli, grp, true); + dix_req_init(req, cli, grp, 1, true); } M0_INTERNAL void m0_dix_req_init(struct m0_dix_req *req, struct m0_dix_cli *cli, - struct m0_sm_group *grp) + struct m0_sm_group *grp, + int64_t min_success) { - dix_req_init(req, cli, grp, false); + dix_req_init(req, cli, grp, min_success, false); } static enum m0_dix_req_state dix_req_state(const struct m0_dix_req *req) @@ -1304,12 +1310,13 @@ static int dix_rop_ctx_init(struct m0_dix_req *req, const struct m0_bufvec *keys, uint64_t *indices) { - struct m0_dix *dix = &req->dr_indices[0]; - struct m0_dix_ldesc *ldesc; - uint32_t keys_nr; - struct m0_buf key; - uint32_t i; - int rc = 0; + struct m0_dix *dix = &req->dr_indices[0]; + struct m0_dix_ldesc *ldesc; + struct m0_pool_version *pver; + uint32_t keys_nr; + struct m0_buf key; + uint32_t i; + int rc = 0; M0_ENTRY(); M0_PRE(M0_IS0(rop)); @@ -1320,6 +1327,13 @@ static int dix_rop_ctx_init(struct m0_dix_req *req, M0_PRE(keys_nr != 0); ldesc = &dix->dd_layout.u.dl_desc; rop->dg_pver = dix_pver_find(req, &ldesc->ld_pver); + M0_ASSERT(ergo(req->dr_min_success < 1, + req->dr_min_success == M0_DIX_MIN_REPLICA_QUORUM)); + if (req->dr_min_success == M0_DIX_MIN_REPLICA_QUORUM) { + pver = m0_dix_pver(req->dr_cli, &req->dr_indices[0]); + req->dr_min_success = (pver->pv_attr.pa_N + + pver->pv_attr.pa_K)/2 + 1; + } M0_ALLOC_ARR(rop->dg_rec_ops, keys_nr); M0_ALLOC_ARR(rop->dg_target_rop, rop->dg_pver->pv_attr.pa_P); if (rop->dg_rec_ops == NULL || rop->dg_target_rop == NULL) @@ -1402,6 +1416,20 @@ static void dix_rop(struct m0_dix_req *req) M0_LEAVE(); } +/** Checks if the given cas get reply has a newer version of the value */ +static int dix_item_version_cmp(const struct m0_dix_item *ditem, + const struct m0_cas_get_reply *get_rep) { + /* + * TODO: once cas versions are propagated, check if the get reply + * has a newer version than seen previously. Will need to add + * version info to struct m0_dix_item. This function should return + * true if no previous value is set, or if the previous value has + * an older version. 
For now, always return true so the last + * reply in the array wins. + */ + return -1; +} + static void dix_item_rc_update(struct m0_dix_req *req, struct m0_cas_req *creq, uint64_t key_idx, @@ -1418,7 +1446,8 @@ static void dix_item_rc_update(struct m0_dix_req *req, case DIX_GET: m0_cas_get_rep(creq, key_idx, &get_rep); rc = get_rep.cge_rc; - if (rc == 0) { + if (rc == 0 && dix_item_version_cmp(ditem, &get_rep) < 0) { + m0_buf_free(&ditem->dxi_val); ditem->dxi_val = get_rep.cge_val; /* Value will be freed at m0_dix_req_fini(). */ m0_cas_rep_mlock(creq, key_idx); @@ -1620,29 +1649,61 @@ static void dix_cas_rop_rc_update(struct m0_dix_cas_rop *cas_rop, int rc) static void dix_rop_completed(struct m0_sm_group *grp, struct m0_sm_ast *ast) { - struct m0_dix_req *req = ast->sa_datum; - struct m0_dix_rop_ctx *rop = req->dr_rop; - struct m0_dix_rop_ctx *rop_del_phase2 = NULL; - bool del_phase2 = false; - struct m0_dix_cas_rop *cas_rop; + struct m0_dix_req *req = ast->sa_datum; + struct m0_dix_rop_ctx *rop = req->dr_rop; + struct m0_dix_rop_ctx *rop_del_phase2 = NULL; + bool del_phase2 = false; + struct m0_dix_cas_rop *cas_rop; + int64_t min_success; + int64_t successful_ops = 0; (void)grp; if (req->dr_type == DIX_NEXT) m0_dix_next_result_prepare(req); else { + min_success = req->dr_min_success; + M0_ASSERT(min_success > 0); + + successful_ops = m0_tl_reduce(cas_rop, scan, &rop->dg_cas_reqs, 0, + + !!(scan->crp_creq.ccr_sm.sm_rc == 0)); + /* - * Consider DIX request to be successful if there is at least - * one successful CAS request. + * The idea here is that transient failures are likely to + * occur and may not persist long enough that the node gets + * marked as failed. These will still affect individual + * operations, so we need to make sure that dix correctly + * handles the issues (if possible) or returns a failure to + * the client. We therefore let the user choose min_success, + * which determines the minimum number of successful cas + * operations to consider the parent dix operation successful. + * This is necessary to ensure read-after-write consistency. + * If min_success is set to (N+K)/2 + 1 for both reads and + * writes, then even in the presence of transient failures at + * least one copy of the most recent version of data will be + * found. Other values can be set for reduced consistency or + * balancing read vs. write. + * + * Here we compare the previously computed successful_ops + * and min_success to decide if we can ignore failed cas + * operations. If successful_ops >= min_success, we've met + * the quorum requirement and can ignore failures. This is + * done by skipping dix_cas_rop_rc_update for failed cas + * operations. We're guaranteed to have at least one + * successful cas op somewhere in the list, so this results + * in the parent dix operation being considered a success, + * and cas version is used to break ties between multiple + * successful replies (see dix_item_version_cmp). In the + * case that successful_ops < min_success, we call + * dix_cas_rop_rc_update for every cas op, with the result + * that the failed operations will cause the parent dix op + * to fail. Since min_success must be greater than 0, this + * covers the case that all cas requests fail. 
*/ - if (m0_tl_forall(cas_rop, cas_rop, - &rop->dg_cas_reqs, - cas_rop->crp_creq.ccr_sm.sm_rc != 0)) - dix_cas_rop_rc_update(cas_rop_tlist_tail( - &rop->dg_cas_reqs), 0); - m0_tl_for (cas_rop, &rop->dg_cas_reqs, cas_rop) { - if (cas_rop->crp_creq.ccr_sm.sm_rc == 0) + if (successful_ops < min_success || + cas_rop->crp_creq.ccr_sm.sm_rc == 0) { dix_cas_rop_rc_update(cas_rop, 0); + } m0_cas_req_fini(&cas_rop->crp_creq); } m0_tl_endfor; } @@ -2137,10 +2198,11 @@ static void dix_rop_units_set(struct m0_dix_req *req) m0_rwlock_read_unlock(&pm->pm_lock); /* - * Only one CAS GET request should be sent for every record. + * For meta requests, + * only one CAS GET request should be sent for every record. * Choose the best destination for every record. */ - if (req->dr_type == DIX_GET) { + if (req->dr_type == DIX_GET && req->dr_is_meta) { for (i = 0; i < rop->dg_rec_ops_nr; i++) dix_online_unit_choose(req, &rop->dg_rec_ops[i]); } diff --git a/dix/req.h b/dix/req.h index 4eeae2b5b81..3e05f8f0d9e 100644 --- a/dix/req.h +++ b/dix/req.h @@ -253,6 +253,11 @@ struct m0_dix_req { * starting key in DIX_NEXT request. */ uint32_t *dr_recs_nr; + /** + * Minimum number of successful CAS operations to treat + * parent DIX operation as successful. + */ + int64_t dr_min_success; /** Request flags bitmask of m0_cas_op_flags values. */ uint32_t dr_flags; @@ -283,7 +288,8 @@ struct m0_dix_next_reply { /** Initialises DIX request. */ M0_INTERNAL void m0_dix_req_init(struct m0_dix_req *req, struct m0_dix_cli *cli, - struct m0_sm_group *grp); + struct m0_sm_group *grp, + int64_t min_success); /** * Initialises DIX request operating with meta-indices. diff --git a/dix/ut/client_ut.c b/dix/ut/client_ut.c index 81470106e48..e5a9e8a3be1 100644 --- a/dix/ut/client_ut.c +++ b/dix/ut/client_ut.c @@ -1244,7 +1244,7 @@ static int dix_common_idx_flagged_op(const struct m0_dix *indices, int rc; int i; - m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp); + m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp, 1); m0_dix_req_lock(&req); switch (type) { case REQ_CREATE: @@ -1285,6 +1285,7 @@ static int dix_common_rec_op(const struct m0_dix *index, const struct m0_bufvec *keys, struct m0_bufvec *vals, const uint32_t *recs_nr, + int64_t min_success, uint32_t flags, struct dix_rep_arr *rep, enum ut_dix_req_type type) @@ -1294,7 +1295,7 @@ static int dix_common_rec_op(const struct m0_dix *index, int i; int k = 0; - m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp); + m0_dix_req_init(&req, &dix_ut_cctx.cl_cli, dix_ut_cctx.cl_grp, min_success); m0_dix_req_lock(&req); switch (type) { case REQ_PUT: @@ -1397,21 +1398,31 @@ static int dix_ut_put(const struct m0_dix *index, uint32_t flags, struct dix_rep_arr *rep) { - return dix_common_rec_op(index, keys, vals, NULL, flags, rep, REQ_PUT); + return dix_common_rec_op(index, keys, vals, NULL, 1, flags, rep, REQ_PUT); +} + +static int dix_ut_put_min_success(const struct m0_dix *index, + const struct m0_bufvec *keys, + struct m0_bufvec *vals, + int64_t min_success, + uint32_t flags, + struct dix_rep_arr *rep) +{ + return dix_common_rec_op(index, keys, vals, NULL, min_success, flags, rep, REQ_PUT); } static int dix_ut_get(const struct m0_dix *index, const struct m0_bufvec *keys, struct dix_rep_arr *rep) { - return dix_common_rec_op(index, keys, NULL, NULL, 0, rep, REQ_GET); + return dix_common_rec_op(index, keys, NULL, NULL, 1, 0, rep, REQ_GET); } static int dix_ut_del(const struct m0_dix *index, const struct m0_bufvec *keys, struct dix_rep_arr *rep) { - return 
dix_common_rec_op(index, keys, NULL, NULL, 0, rep, REQ_DEL); + return dix_common_rec_op(index, keys, NULL, NULL, 1, 0, rep, REQ_DEL); } static int dix_ut_next(const struct m0_dix *index, @@ -1420,7 +1431,7 @@ static int dix_ut_next(const struct m0_dix *index, uint32_t flags, struct dix_rep_arr *rep) { - return dix_common_rec_op(index, start_keys, NULL, recs_nr, flags, + return dix_common_rec_op(index, start_keys, NULL, recs_nr, 1, flags, rep, REQ_NEXT); } @@ -2655,17 +2666,35 @@ static void local_failures(void) dix_kv_alloc_and_fill(&keys, &vals, COUNT); rc = dix_common_idx_op(&index, 1, REQ_CREATE); M0_UT_ASSERT(rc == 0); + /* - * Consider DIX request to be successful if there is at least - * one successful CAS request. Here two cas requests can be - * sent successfully. + * Consider DIX request to be successful only if there are + * enough successful CAS requests to satisfy min_success. + * Here two cas requests can be sent successfully. First, try with + * min_success = 3, which should result in all CAS requests failing. */ m0_fi_enable_off_n_on_m("cas_req_replied_cb", "send-failure", 2, 3); - rc = dix_ut_put(&index, &keys, &vals, 0, &rep); + rc = dix_ut_put_min_success(&index, &keys, &vals, 3, 0, &rep); + m0_fi_disable("cas_req_replied_cb", "send-failure"); + M0_UT_ASSERT(rc == 0); + M0_UT_ASSERT(rep.dra_nr == COUNT); + M0_UT_ASSERT(m0_forall(i, COUNT, rep.dra_rep[i].dre_rc == -ENOTCONN)); + + dix_rep_free(&rep); + rc = dix_ut_del(&index, &keys, &rep); + M0_UT_ASSERT(rc == 0); + dix_rep_free(&rep); + + /* + * Now try again with min_success = 2, which should succeed. + */ + m0_fi_enable_off_n_on_m("cas_req_replied_cb", "send-failure", 2, 3); + rc = dix_ut_put_min_success(&index, &keys, &vals, 2, 0, &rep); m0_fi_disable("cas_req_replied_cb", "send-failure"); M0_UT_ASSERT(rc == 0); M0_UT_ASSERT(rep.dra_nr == COUNT); M0_UT_ASSERT(m0_forall(i, COUNT, rep.dra_rep[i].dre_rc == 0)); + dix_rep_free(&rep); dix_kv_destroy(&keys, &vals); dix_index_fini(&index); diff --git a/doc/CORTX-MOTR-ARCHITECTURE.md b/doc/CORTX-MOTR-ARCHITECTURE.md index 059468f208b..e9014fb22f9 100644 --- a/doc/CORTX-MOTR-ARCHITECTURE.md +++ b/doc/CORTX-MOTR-ARCHITECTURE.md @@ -75,7 +75,7 @@ # Object Layout # + Object is an array of blocks. Arbitrary scatter-gather IO with overwrite. Object has layout. + Default layout is parity de-clustered network raid: N+K+S striping. - + More details about [parity declustering](doc/pdclust/index.rst) + + More details about [parity declustering](pdclust/index.rst) + Layout takes hardware topology into account: distribute units to support fault-tolerance. ![image](./Images/6_Object_Layout.png) diff --git a/doc/HLD-Data-Integrity.md b/doc/HLD-Data-Integrity.md new file mode 100644 index 00000000000..f3d94f691f4 --- /dev/null +++ b/doc/HLD-Data-Integrity.md @@ -0,0 +1,145 @@ +# HLD of Data Integrity + +- I. Motr Client + - I.1 Application and motr data structure + - I.2 Parity Group Computation + - I.3 Tracking Data Unit Allocated to Object + - I.4 FOP Sending and Checksum Processing + - Write Path + - Read Path +- II. Motr Server Write Path + - II.1 Global Object => Component Object + - II.2 Balloc Processing + - Balloc extent and buffer extent processing + - II.3 EMAP Extent Processing + - II.4 COB-EMAP Details + - II.5 Checksum storage with EMAP Extent + +This document will give details of DI implementation in Motr + +## I. 
Motr Client
+### I.1 Application and Motr data structure
+The application sends data as a scatter-gather list (SGL) of buffers (ioo_data); it also sends an index list of object offsets corresponding to those buffers (ioo_ext). There can be multiple requests for reading/writing the same object.
+
+The example below describes a scenario where the application sends a second request to Motr for the same object; a small worked mapping of offsets to units follows the figure.
+
+- Parity stripe with N (Data Units) = 4; K (Parity Units) = 2; S (Spare Units) = 0
+
+- Application buffer size 16KB
+
+- Unit Size (US) = 1MB
+
+- Motr default Page Size (PS) = 4KB
+
+- The previous request has processed Data Units (DU) 0-7, i.e. Parity Group (PG) 0 & PG 1
+
+  - The current IO is for DU 8-15, i.e. PG 2 & PG 3
+
+![image](./Images/DI01.png)
+*Received from application*
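+
+A minimal illustration (not Motr source code; the constants and helper names below are assumptions chosen to match the example numbers) of how a global object offset maps to a data-unit (DU) index and parity-group (PG) index:
+
+```c
+#include <stdint.h>
+#include <assert.h>
+
+enum { N = 4, US = 1024 * 1024 };  /* 4 data units per group, 1MB unit size */
+
+/* Data-unit index covering a given global object offset. */
+static uint64_t du_index(uint64_t goff) { return goff / US; }
+
+/* Parity-group index of a given data unit. */
+static uint64_t pg_index(uint64_t du) { return du / N; }
+
+int main(void)
+{
+	/* Second request of the example: bytes [8MB, 16MB). */
+	assert(du_index(8 * (uint64_t)US) == 8  && pg_index(8)  == 2);
+	assert(du_index(16 * (uint64_t)US - 1) == 15 && pg_index(15) == 3);
+	return 0;
+}
+```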
+
+### I.2 Parity Group Computation
+- The Motr client computes the number of parity groups in the request (ioo_iomap_nr); a small sketch of this computation follows the figure below
+
+- It allocates data structures for all data (N) and parity (K) units
+
+- It populates the parity group data structure for further processing (ioo_iomaps)
+
+- Data is allocated on a page/segment (4K) basis.
+
+![image](./Images/DI02.png)
+*Parity Group Data Structure*
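+
+As a sketch of the first step above (illustrative only; the real computation of ioo_iomap_nr in Motr is more involved), the number of parity groups spanned by a single extent can be derived like this:
+
+```c
+#include <stdint.h>
+
+enum { N = 4, US = 1024 * 1024 };  /* data units per group, unit size */
+
+/* Number of parity groups touched by the extent [off, off + len). */
+static uint64_t pg_count(uint64_t off, uint64_t len)
+{
+	uint64_t grpsz = (uint64_t)N * US;
+
+	return (off + len - 1) / grpsz - off / grpsz + 1;
+}
+/* pg_count(8 * US, 8 * US) == 2: the request spans PG 2 and PG 3. */
+```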
+
+### I.3 Tracking Data Unit Allocated to Object
+For DI computation, an array (ti_goff_ivec) is allocated for each target to track the global offset of each segment.
+
+![image](./Images/DI03.png)
+*Mapping Data and Parity to Global Offset Space*
+
+![image](./Images/DI04.png)
+### I.4 FOP Sending and Checksum Processing
+During FOP processing, the Parity Group Index and Data Unit Index are computed from the DU goff that was added to the target structure (ti_goff_ivec) and are stored in the FOP structure/array (irf_cksum_data).
+
+### Write Path
+During the write path the checksum for the data is also computed for each DU and added to the FOP. Checksum computation is seeded with the DU Index.
+
+![image](./Images/DI05.png)
+### Read Path
+During the read path, when the data is received from the Motr server, the checksum is computed and compared against the received checksum.
+
+![image](./Images/DI06.png)
+## II. Motr Server Write Path
+### II.1 Global Object => Component Object
+Every Motr object is identified by a FID, also known as the Global Object FID, and its stripe units on devices are identified by Component Object FIDs.
+
+The Component Object FID is derived from the Global Object FID by adding the Device ID to it.
+
+```c
+// Logical representation
+cob_fid = (gob_fid | device_id << M0_FID_DEVICE_ID_OFFSET)
+```
+Every device on which a stripe/shard of the object is present will have a COB entry.
+
+### II.2 Balloc Processing
+The Motr client sends the data buffer and checksum buffer to the server using RPC.
+
+- The Motr server requests blocks from the balloc module to cover the total size of the data buffer sent by the client
+
+- Balloc will attempt to allocate the total size as one extent
+
+  - If one chunk is not available then multiple balloc extents can be allocated
+
+  - Currently more than one chunk will cause a failure
+
+- The diagram below shows three balloc extents getting allocated for two data DUs.
+
+### Balloc extent and buffer extent processing
+As part of balloc processing, the server code finds the number of contiguous fragments using the overlap of balloc extents and buffer extents. A data structure is also populated to track this:
+
+- m0_bufvec si_user : tracks buffer fragments
+
+- m0_indexvec si_stob : tracks balloc fragments
+
+![image](./Images/DI07.png)
+*Balloc Processing and Fragment Computation*
+
+These balloc extents, along with their buffers, form units for Storage IO.
+
+![image](./Images/DI08.png)
+### II.3 EMAP Extent Processing
+As part of EMAP extent processing, contiguous fragments are computed using the overlap of the object offset extent (COB offset) and the balloc extent. This EMAP fragment data is processed later and gets written to the device EMAP btree.
+
+EMAP fragment data consists of the following important fields:
+
+- COB Offset Extent
+
+  - e_start
+
+  - e_end
+
+- Balloc Extent Start
+
+  - ee_val
+
+![image](./Images/DI09.png)
+### II.4 COB-EMAP Details
+- When a COB is created, a default entry for the object extent is created
+
+  - A fake extent with a span of 0 to ∞
+
+- If an entry gets added at the start, it cuts into this fake extent and creates two segments
+
+  - The new entry.
+
+  - The fake extent gets right-shifted.
+
+![image](./Images/DI10.png)
+Using the above concepts, the three EMAP extents get added to the EMAP metadata btree.
+
+### II.5 Checksum storage with EMAP Extent
+The checksum for all the DUs that start in a balloc extent gets added to the corresponding EMAP entry. During EMAP processing the checksum gets transferred to the correct extent and written in the btree node.
+![image](./Images/DI11.png)
+
+
diff --git a/doc/HLD-of-SNS-Repair.md b/doc/HLD-of-SNS-Repair.md
index d6a1e2bf07b..748184fa0a6 100644
--- a/doc/HLD-of-SNS-Repair.md
+++ b/doc/HLD-of-SNS-Repair.md
@@ -89,7 +89,7 @@ Following topics deserve attention:
 * Details of interaction between repair and DTM must be specified.
 * Redundancy other than N+1 (N+K, K > 1) must be regarded as a default configuration.
 * Multiple failures and repair in the presence of multiple failures must be considered systematically.
-* Repair and re-balancing must be clearly distinguished.
+* Repair and re-balancing must be distinguished appropriately.
 * Reclaim of a distributed spare space must be addressed (this is done in a separate Distributed Spare design documentation).
 * locking optimizations.
@@ -150,7 +150,7 @@ Agent iterates components over the affected container or all the containers whic
 ### 5.11. SNS repair and layout ###
 The SNS manager gets an input set configuration and output set configuration as the repair is initiated. These input/output sets can be described by some form of layout. The SNS repair will read the data/parity from the devices described with the input set and reconstruct the missing data. In the process of reconstruction object layouts affected by the data reconstruction (layouts with data located on the lost storage device or node) are transactionally updated to reflect changed data placement. Additionally, while the reconstruction is in-progress, all affected layouts are switched into a degraded mode so that the clients can continue to access and modify data.
-Note that the standard mode of operation is a so-called "non-blocking availability" (NBA) where after a failure the client can immediately continue writing new data without any IO degradation. To this end, a client is handed out a new layout to which it can write. After this point, the cluster-wide object has a composite layout: some parts of the object's linear name-space are laid accordingly to the old layout, and other parts (ones where clients write after a failure)—are a new one. In this configuration, clients never write to the old layout, while its content is being reconstructed.
+Note that the standard mode of operation is a so-called "non-blocking availability" (NBA) where after a failure the client can immediately continue writing new data without any IO degradation. To this end, a client is handed out a new layout to which it can write. After this point, the cluster-wide object has a composite layout: some parts of the object's linear name-space are mapped accordingly to the old layout, and other parts (ones where clients write after a failure)—are a new one. In this configuration, clients never write to the old layout, while its content is being reconstructed. The situation where there is a client-originated IO against layouts being reconstructed is possible because of: * Reads have to access old data even under NBA policy and diff --git a/doc/HLD-of-SNS-client.md b/doc/HLD-of-SNS-client.md index 2d9d32b4db9..ef211ee5ea9 100644 --- a/doc/HLD-of-SNS-client.md +++ b/doc/HLD-of-SNS-client.md @@ -42,7 +42,7 @@ External SNS client interfaces are standard Linux file_operations and address_sp ## Logical Specification ### fop builder, NRS, and request handler -A fop, representing IO operation is created at the VFS or VM entry point1. The fop is then passed to the dummy NRS(23), which immediately passes it down to the request handler. The request handler uses file meta-data to identify the layout and calls the layout IO engine to proceed with the IO operation. +A fop, representing IO operation is created at the VFS or VM entry point1. The fop is then passed to the fake NRS(23), which immediately passes it down to the request handler. The request handler uses file meta-data to identify the layout and calls the layout IO engine to proceed with the IO operation. ### Layout Schema The layout formula generates a parity de-clustered file layout for a particular file, using file id (fid) as an identifier[2]. See Parity De-clustering Algorithm HLD [3] for details. At the moment, **m0t1fs** supports a single file with fid supplied as a mount option. diff --git a/doc/ISC-Service-User-Guide b/doc/ISC-Service-User-Guide index e78000ad3b5..00df051b4b6 100644 --- a/doc/ISC-Service-User-Guide +++ b/doc/ISC-Service-User-Guide @@ -96,11 +96,11 @@ Consider a simple API that on reception of string “Hello” responds with “W char *in_string, struct m0_rpc_conn *conn) { int rc; - /* A string is mapped to a mero buffer. */ + /* A string is mapped to a motr buffer. */ m0_buf_init(in_args, in_string, strlen(in_string)); /* Initialise RPC adaptive transmission data structure. */ m0_rpc_at_init(&isc_fop->fi_args); - /* Add mero buffer to m0_rpc_at */ + /* Add motr buffer to m0_rpc_at */ rc = m0_rpc_at_add(&isc_fop->fi_args, in_args, conn); @@ -198,7 +198,7 @@ We now discuss the callee side code. Let’s assume that the function is registe if (m0_buf_streq(in, “Hello”)) { /* * The string allocated here should not be freed by - * computation and Mero takes care of freeing it. + * computation and Motr takes care of freeing it. */ out_str = m0_strdup(“World”); @@ -224,7 +224,7 @@ Suppose we have a collection of arrays of integers, each stored as a Motr object ``` /* Arguments for getting min/max. */ struct arr_fids { - /* Number of arrays stored with Mero. */ + /* Number of arrays stored with Motr. */ uint32_t af_arr_nr; /* An array holding unique identifiers of arrays. */ struct m0_fid *af_gfids @@ -280,7 +280,7 @@ struct histo_args { /** Minimum value. */ uint64_t ha_min_val; - /** Global fid of object stored with Mero. */ + /** Global fid of object stored with Motr. 
*/ struct m0_fid ha_gob_fid; } M0_XCA_RECORD; @@ -295,7 +295,7 @@ Here we discuss the API for generating a histogram of values, local to a node. T * Structure of a computation is advisable to be similar to * Motr foms. It returns M0_FSO_WAIT when it has to wait for * an external event (n/w or disk I/O)else it returns - * M0_FSO_AGAIN. These two symbols are defined in Mero. + * M0_FSO_AGAIN. These two symbols are defined in Motr. */ int histo_generate(struct m0_buf *in, struct m0_buf *out, struct m0_isc_comp_private *comp_data, diff --git a/doc/ISC-Service-User-Guide.md b/doc/ISC-Service-User-Guide.md index 8b8e9366d68..7f97eac33b3 100644 --- a/doc/ISC-Service-User-Guide.md +++ b/doc/ISC-Service-User-Guide.md @@ -96,11 +96,11 @@ Consider a simple API that on reception of string “Hello” responds with “W char *in_string, struct m0_rpc_conn *conn) { int rc; - /* A string is mapped to a mero buffer. */ + /* A string is mapped to a motr buffer. */ m0_buf_init(in_args, in_string, strlen(in_string)); /* Initialise RPC adaptive transmission data structure. */ m0_rpc_at_init(&isc_fop->fi_args); - /* Add mero buffer to m0_rpc_at */ + /* Add motr buffer to m0_rpc_at */ rc = m0_rpc_at_add(&isc_fop->fi_args, in_args, conn); @@ -199,7 +199,7 @@ We now discuss the callee side code. Let’s assume that the function is registe if (m0_buf_streq(in, “Hello”)) { /* * The string allocated here should not be freed by - * computation and Mero takes care of freeing it. + * computation and Motr takes care of freeing it. */ out_str = m0_strdup(“World”); @@ -225,7 +225,7 @@ Suppose we have a collection of arrays of integers, each stored as a Motr object ```C /* Arguments for getting min/max. */ struct arr_fids { - /* Number of arrays stored with Mero. */ + /* Number of arrays stored with Motr. */ uint32_t af_arr_nr; /* An array holding unique identifiers of arrays. */ struct m0_fid *af_gfids @@ -281,7 +281,7 @@ struct histo_args { /** Minimum value. */ uint64_t ha_min_val; - /** Global fid of object stored with Mero. */ + /** Global fid of object stored with Motr. */ struct m0_fid ha_gob_fid; } M0_XCA_RECORD; @@ -295,7 +295,7 @@ Here we discuss the API for generating a histogram of values, local to a node. T * Structure of a computation is advisable to be similar to * Motr foms. It returns M0_FSO_WAIT when it has to wait for * an external event (n/w or disk I/O)else it returns - * M0_FSO_AGAIN. These two symbols are defined in Mero. + * M0_FSO_AGAIN. These two symbols are defined in Motr. 
*/ ```C int histo_generate(struct m0_buf *in, struct m0_buf *out, diff --git a/doc/Images/DI01.png b/doc/Images/DI01.png new file mode 100644 index 00000000000..1ae20a44047 Binary files /dev/null and b/doc/Images/DI01.png differ diff --git a/doc/Images/DI02.png b/doc/Images/DI02.png new file mode 100644 index 00000000000..e3310c80807 Binary files /dev/null and b/doc/Images/DI02.png differ diff --git a/doc/Images/DI03.png b/doc/Images/DI03.png new file mode 100644 index 00000000000..b29c84aff3f Binary files /dev/null and b/doc/Images/DI03.png differ diff --git a/doc/Images/DI04.png b/doc/Images/DI04.png new file mode 100644 index 00000000000..1a7631b42da Binary files /dev/null and b/doc/Images/DI04.png differ diff --git a/doc/Images/DI05.png b/doc/Images/DI05.png new file mode 100644 index 00000000000..440d1c63dcd Binary files /dev/null and b/doc/Images/DI05.png differ diff --git a/doc/Images/DI06.png b/doc/Images/DI06.png new file mode 100644 index 00000000000..01c3eebd219 Binary files /dev/null and b/doc/Images/DI06.png differ diff --git a/doc/Images/DI07.png b/doc/Images/DI07.png new file mode 100644 index 00000000000..969c474ffb9 Binary files /dev/null and b/doc/Images/DI07.png differ diff --git a/doc/Images/DI08.png b/doc/Images/DI08.png new file mode 100644 index 00000000000..3a08e4af500 Binary files /dev/null and b/doc/Images/DI08.png differ diff --git a/doc/Images/DI09.png b/doc/Images/DI09.png new file mode 100644 index 00000000000..9b7a614938d Binary files /dev/null and b/doc/Images/DI09.png differ diff --git a/doc/Images/DI10.png b/doc/Images/DI10.png new file mode 100644 index 00000000000..d0078c9e6c5 Binary files /dev/null and b/doc/Images/DI10.png differ diff --git a/doc/Images/DI11.png b/doc/Images/DI11.png new file mode 100644 index 00000000000..5f672dd4215 Binary files /dev/null and b/doc/Images/DI11.png differ diff --git a/doc/Images/DTM0R Components and message flow.svg b/doc/Images/DTM0R Components and message flow.svg new file mode 100644 index 00000000000..f66562d3229 --- /dev/null +++ b/doc/Images/DTM0R Components and message flow.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/Motr-Lnet-Transport.md b/doc/Motr-Lnet-Transport.md index 94c849e7fa3..a1c0bb58c74 100644 --- a/doc/Motr-Lnet-Transport.md +++ b/doc/Motr-Lnet-Transport.md @@ -381,7 +381,7 @@ A Motr server uses the following pattern to use the LNet transport to initiate a A Motr tool uses the following pattern to use the LNet transport to initiate passive bulk tranfers to Motr server components: -1. The tool should use an end point address that is not assigned to any mero server or file system client. It should use a dynamic address to achieve this. +1. The tool should use an end point address that is not assigned to any motr server or file system client. It should use a dynamic address to achieve this. 2. To perform a bulk operation, the tool provisions a network buffer. The tool then registers this buffer and enqueues the buffer for transmission. 3. When a buffer operation completes, the buffer can be de-registered and the memory can be de-provisioned. 
@@ -437,7 +437,7 @@ LNet is capable of running without Lustre, but currently is distributed only thr
 ### References
 * [1] T1 Task Definitions
-* [2] Mero Summary Requirements Table
+* [2] Motr Summary Requirements Table
 * [3] m0 Glossary
 * [4] m0LNet Preliminary Design Questions
 * [5] RPC Bulk Transfer Task Plan
diff --git a/doc/RPC_Layer_Core.rst b/doc/RPC_Layer_Core.rst
index a2dde9ebdcb..cc2344de679 100644
--- a/doc/RPC_Layer_Core.rst
+++ b/doc/RPC_Layer_Core.rst
@@ -60,7 +60,7 @@ Requirements
 - [r.rpccore.efficient.bulk] 0-copy, if provided by the underlying network transport, is utilized;
-- [r.rpccore.eos] support ordered exactly once semantics (EOS) of delivery;
+- [r.rpccore.exactly-once-semantics] support ordered exactly once semantics of delivery;
 - [r.rpccore.formation.settings] support different setting like max_rpc_in_flight, max_page_per_rpc, etc.
@@ -239,7 +239,7 @@ Cached FOPs might have dependencies each on other. This could affect the order o
 - m0_rpcmachine is a RPC processing machine, several instances of it might be existing simultaneously.
-- m0_update_stream is an ADT associated with sessions and slots used for FOP sending with FIFO and EOS constrains.
+- m0_update_stream is an ADT associated with sessions and slots used for FOP sending with FIFO and exactly-once-semantics constraints.
*********************
diff --git a/doc/Seagate-FDMI-HLD.md b/doc/Seagate-FDMI-HLD.md
index 1ae638c0620..cb4e100d6c5 100644
--- a/doc/Seagate-FDMI-HLD.md
+++ b/doc/Seagate-FDMI-HLD.md
@@ -12,7 +12,7 @@
 # Introduction #
 ## 1.1 Document's Purpose ##
-The document is intended to specify the design of Mero FDMI interface. FDMI is a part of Mero product. FDMI provides interface for Mero plugins and allows horizontally extending the features and capabilities of the system.
+The document is intended to specify the design of the Motr FDMI interface. FDMI is a part of the Motr product. FDMI provides an interface for Motr plugins and allows horizontally extending the features and capabilities of the system.
 ## 1.2 Intended Audience ##
 * Product Architect
@@ -23,12 +23,12 @@ The document is intended to specify the design of Mero FDMI interface. FDMI is a
 FDMI: File data manipulation interface
 ## 1.4 References ##
-1. “Mero Object Store Architecture: Technical” MeroTechnicalWhitepaper.pdf
-2. “mero a scalable storage platform” Mero technical (toi).pdf
+1. “Motr Object Store Architecture: Technical” MotrTechnicalWhitepaper.pdf
+2. “motr a scalable storage platform” Motr technical (toi).pdf
 3. fdmihighleveldecomposition.pdf
 # Overview #
-Mero is a storage core capable of deployment for a wide range of large scale storage regimes, from cloud and enterprise systems to exascale HPC installations. FDMI is a part of Mero core, providing interface for plugins implementation. FDMI is build around the core and allows for horizontally extending the features and capabilities of the system in a scalable and reliable manner.
+Motr is a storage core capable of deployment for a wide range of large scale storage regimes, from cloud and enterprise systems to exascale HPC installations. FDMI is a part of the Motr core, providing an interface for plugin implementation. FDMI is built around the core and allows for horizontally extending the features and capabilities of the system in a scalable and reliable manner.
## 1.5 Product Purpose ## TBD @@ -52,9 +52,9 @@ In this section only architectural information like the following is displayed b -## 1.7 FDMI position in overall Mero Core design ## +## 1.7 FDMI position in overall Motr Core design ## -FDMI is an interface allowing Mero Core scale horizontally. The scaling includes two aspects: +FDMI is an interface allowing Motr Core scale horizontally. The scaling includes two aspects: * Core expansion in aspect of adding core data processing abilities, including data volumes as well as transformation into alternative representation. The expansion is provided by introducing FDMI plug-ins. @@ -62,15 +62,15 @@ FDMI is an interface allowing Mero Core scale horizontally. The scaling includes * Core expansion in aspect of adding new types of data the core is able to feed plug-ins. This sort of expansion is provided by introducing FDMI sources. - * Initial design implies that FOL record is the only source data type Mero Core provides so far. + * Initial design implies that FOL record is the only source data type Motr Core provides so far. -FDMI plug-in is an application linked with Mero Core to make use of corresponding FDMI interfaces and run separate from Mero instance/services. The purpose of introducing plug-in is getting notifications from Mero Core about particular changes in stored data and further post-processing of the data intended for producing some additional classes of data the Core currently is not able to provide. +FDMI plug-in is an application linked with Motr Core to make use of corresponding FDMI interfaces and run separate from Motr instance/services. The purpose of introducing plug-in is getting notifications from Motr Core about particular changes in stored data and further post-processing of the data intended for producing some additional classes of data the Core currently is not able to provide. -Instead, FDMI source appears to be a part of Mero instance being linked with appropriate FDMI interfaces and allowing connection to additional data providers. +Instead, FDMI source appears to be a part of Motr instance being linked with appropriate FDMI interfaces and allowing connection to additional data providers. -Considering the amount of data Mero Core operates with it obvious that plug-in typically requires a sufficiently reduced bulk of data to be routed to it for post-processing. The reduction is provided by introduction of mechanism of subscription to particular data types and conditions met at runtime. The subscription mechanism is based on set of filters the plug-in registers in Mero Filter Database during its initialization. +Considering the amount of data Motr Core operates with it obvious that plug-in typically requires a sufficiently reduced bulk of data to be routed to it for post-processing. The reduction is provided by introduction of mechanism of subscription to particular data types and conditions met at runtime. The subscription mechanism is based on set of filters the plug-in registers in Motr Filter Database during its initialization. Source in its turn refreshes its own subset of filters against the database. The subset is selected from overall filter set based on the knowledge about data types the source is able to feed FDMI with as well as operation with the data the source supports. 
@@ -80,7 +80,7 @@ FDMI consists of APIs implementing particular roles in accordance with FDMI use * Plug-in dock, responsible for: * Plug-in registration in FDMI instance - * Filter registration in Mero Filter Database + * Filter registration in Motr Filter Database * Listening to notifications coming over RPC * Payload processing * Self-diagnostic (TBD) @@ -89,7 +89,7 @@ FDMI consists of APIs implementing particular roles in accordance with FDMI use * Source registration * Retrieving/refreshing filter set for the source * Input data filtration - * Deciding on and posting notifications to filter subscribers over Mero RPC + * Deciding on and posting notifications to filter subscribers over Motr RPC * Deferred input data release * Self-diagnostic (TBD) @@ -176,6 +176,6 @@ Input data may require to remain locked in the Source until the moment when plug ![image](./images/Image8_FDMIserviceFoundDead.PNG) -When interaction between Mero services results in a timeout exceeding pre-configured value, the not responding service needs to be announced dead across the whole system. First of all **confd** service is notified about the service not responding. After being marked dead in **confd** database, the service has to be reported to **filterd** as well. The main purpose is to deregister FDMI sources hosted by the service, if any, to stop propagating **filterd** database changes to those. +When interaction between Motr services results in a timeout exceeding pre-configured value, the not responding service needs to be announced dead across the whole system. First of all **confd** service is notified about the service not responding. After being marked dead in **confd** database, the service has to be reported to **filterd** as well. The main purpose is to deregister FDMI sources hosted by the service, if any, to stop propagating **filterd** database changes to those. As well, the moment of the last instance of the source type coming out, the corresponding plug-ins might be notified. diff --git a/doc/faq.rst b/doc/faq.rst index 96534f4f582..dc2db05b3db 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -43,6 +43,6 @@ Mero -> Motr rename make[1]: *** [all-recursive] Error 1 make: *** [all] Error 2 - A: Remove ``/etc/ld.so.conf.d/mero.conf``, then rebuild Motr after ``git + A: Remove ``/etc/ld.so.conf.d/motr.conf``, then rebuild Motr after ``git clean -dfx`` (WARNING: removes all files that are not staged and are not in the repo). 
diff --git a/doc/fdmi_demo/demo-fdmi/m0-instance/Makefile b/doc/fdmi_demo/demo-fdmi/m0-instance/Makefile index 194c059f9aa..a2c205fa47a 100755 --- a/doc/fdmi_demo/demo-fdmi/m0-instance/Makefile +++ b/doc/fdmi_demo/demo-fdmi/m0-instance/Makefile @@ -1,6 +1,6 @@ CC=gcc -MERO_PATH=/root/mero-true-bulk-rebased +MOTR_PATH=/root/motr-true-bulk-rebased LUSTRE_PATH=/usr/src/lustre-2.7.18.4-headers CFLAGS=-g -std=gnu99 -Wall -Werror -Wno-attributes -Wno-unused-variable \ @@ -8,10 +8,10 @@ CFLAGS=-g -std=gnu99 -Wall -Werror -Wno-attributes -Wno-unused-variable \ -DM0_EXTERN=extern -fno-strict-aliasing -fno-omit-frame-pointer -fno-common \ -fPIC -INCLUDE_FLAGS=-include config.h -I$(MERO_PATH) -I$(LUSTRE_PATH)/lnet/include \ +INCLUDE_FLAGS=-include config.h -I$(MOTR_PATH) -I$(LUSTRE_PATH)/lnet/include \ -I$(LUSTRE_PATH)/lustre/include -LDFLAGS=-L$(MERO_PATH)/extra-libs/gf-complete/src/.libs -L$(MERO_PATH)/mero/.libs -lm -lpthread -lrt -lgf_complete -lyaml -luuid -lmero +LDFLAGS=-L$(MOTR_PATH)/extra-libs/gf-complete/src/.libs -L$(MOTR_PATH)/motr/.libs -lm -lpthread -lrt -lgf_complete -lyaml -luuid -lmotr OBJS=src/main.o diff --git a/doc/fdmi_demo/demo-fdmi/m0-instance/src/main.c b/doc/fdmi_demo/demo-fdmi/m0-instance/src/main.c index f69361707ea..662506345ea 100755 --- a/doc/fdmi_demo/demo-fdmi/m0-instance/src/main.c +++ b/doc/fdmi_demo/demo-fdmi/m0-instance/src/main.c @@ -24,7 +24,7 @@ #include "pool/pool.h" /* m0_pool_version */ #include "conf/confc.h" /* m0_confc_close */ #include "net/lnet/lnet.h" /* m0_net_lnet_xprt */ -#include "mero/ha.h" +#include "motr/ha.h" #include "rpc/rpc_machine.h" /* m0_rpc_machine */ #include "rpc/rpc.h" /* m0_rpc_bufs_nr */ #include "reqh/reqh.h" /* m0_reqh */ @@ -34,9 +34,9 @@ #include "fdmi/service.h" #include "fdmi/plugin_dock.h" #include "fdmi/plugin_dock_internal.h" -#include +#include #include -#include +#include #define MALLOC_ARR(arr, nr) ((arr) = malloc((nr) * sizeof ((arr)[0]))) diff --git a/doc/fdmi_demo/demo-fdmi/clovis-app/Makefile b/doc/fdmi_demo/demo-fdmi/motr-client/Makefile similarity index 55% rename from doc/fdmi_demo/demo-fdmi/clovis-app/Makefile rename to doc/fdmi_demo/demo-fdmi/motr-client/Makefile index 158bb858ba0..30eb9c1f35c 100644 --- a/doc/fdmi_demo/demo-fdmi/clovis-app/Makefile +++ b/doc/fdmi_demo/demo-fdmi/motr-client/Makefile @@ -1,6 +1,6 @@ CC=gcc -MERO_PATH=/root/mero-true-bulk-rebased +MOTR_PATH=/root/motr LUSTRE_PATH=/usr/src/lustre-2.7.18.4-headers CFLAGS=-g -std=gnu99 -Wall -Werror -Wno-attributes -Wno-unused-variable \ @@ -8,21 +8,21 @@ CFLAGS=-g -std=gnu99 -Wall -Werror -Wno-attributes -Wno-unused-variable \ -DM0_EXTERN=extern -fno-strict-aliasing -fno-omit-frame-pointer -fno-common \ -fPIC -INCLUDE_FLAGS=-include config.h -I$(MERO_PATH) -I$(LUSTRE_PATH)/lnet/include \ +INCLUDE_FLAGS=-include config.h -I$(MOTR_PATH) -I$(LUSTRE_PATH)/lnet/include \ -I$(LUSTRE_PATH)/lustre/include -LDFLAGS=-L$(MERO_PATH)/extra-libs/gf-complete/src/.libs -L$(MERO_PATH)/mero/.libs -lm -lpthread -lrt -lgf_complete -lyaml -luuid -lmero +LDFLAGS=-L$(MOTR_PATH)/extra-libs/gf-complete/src/.libs -L$(MOTR_PATH)/motr/.libs -lm -lpthread -lrt -lgf_complete -lyaml -luuid -lmotr OBJS=src/main.o NID:=$(sudo lctl list_nids) -clovis-app: $(OBJS) +motr-client: $(OBJS) $(CC) -o $@ $(OBJS) $(LDFLAGS) .c.o: $(CC) -c $(CFLAGS) $(INCLUDE_FLAGS) -o $@ $< -# test: clovis-app -# ./clovis-app $(NID):12345:45:1 $(NID):12345:44:101 '<0x7000000000000001:0>' '<0x7200000000000000:0>' +# test: motr-client +# ./motr-client $(NID):12345:45:1 $(NID):12345:44:101 
'<0x7000000000000001:0>' '<0x7200000000000000:0>' # - diff --git a/doc/fdmi_demo/demo-fdmi/clovis-app/src/main.c b/doc/fdmi_demo/demo-fdmi/motr-client/src/main.c similarity index 98% rename from doc/fdmi_demo/demo-fdmi/clovis-app/src/main.c rename to doc/fdmi_demo/demo-fdmi/motr-client/src/main.c index 4cc2e26c144..99827176e60 100644 --- a/doc/fdmi_demo/demo-fdmi/clovis-app/src/main.c +++ b/doc/fdmi_demo/demo-fdmi/motr-client/src/main.c @@ -2,8 +2,8 @@ #include #include #include -#include -#include +#include +#include #include "lib/trace.h" #include "fdmi/fdmi.h" #include "fdmi/service.h" diff --git a/dtm0/dtm0-dld.c b/dtm0/dtm0-dld.c new file mode 100644 index 00000000000..e81bf2edb21 --- /dev/null +++ b/dtm0/dtm0-dld.c @@ -0,0 +1,1468 @@ +/* -*- C -*- */ +/* + * Copyright (c) 2012-2020 Seagate Technology LLC and/or its Affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * For any questions about this software or licensing, + * please email opensource@seagate.com or cortx-questions@seagate.com. + * + */ + + +/** + @page DLD Motr DLD Template + + - @ref DLD-ovw (todo) + - @ref DLD-def (done) + - @ref DLD-req + - @ref DLD-depends + - @ref DLD-highlights + - @subpage DLD-fspec "Functional Specification" + - @ref DLD-lspec + - @ref DLD-lspec-comps + - @ref DLD-lspec-sub + - @ref DLD-lspec-state + - @ref DLD-lspec-thread + - @ref DLD-lspec-numa + - @ref DLD-conformance + - @ref DLD-ut + - @ref DLD-st + - @ref DLD-O + - @ref DLD-ref + - @ref DLD-impl-plan + +
+  @section DLD-ovw Overview
+  All specifications must start with an Overview section that
+  briefly describes the document and provides any additional
+  instructions or hints on how to best read the specification.
+
+  DTM0 does asynchronous replication of Units (see the definitions)
+  among the transaction participants. This is needed to improve data
+  durability in case of transient cluster component failures or
+  network issues (like temporary disconnections or partitioning).
+
+  Let's take for example the following scenario: 3-way replication
+  (3 CAS services), write quorum == 2, read quorum == 2. In this case,
+  if we have more than 1 failure, we may get the wrong data. Consider
+  the following sequence:
+
+  1) dix PUT1
+  2) cas3 is TRANSIENT, PUT1 is written to cas1 and cas2
+  3) PUT1 is SUCCESSFUL, because write quorum was reached
+  4) cas1 goes FAILED and is replaced by a new cas4
+  5) cas3 goes ONLINE
+  6) cas2 goes TRANSIENT
+  7) dix rebalance starts to cas4
+  8) cas2 goes ONLINE
+  9) PUT1 is replicated to cas4 via rebalance
+  10) In addition to PUT1 another 1 million records are replicated
+  ..... TODO
+
+  XXX: must we use the same read quorum on rebalance?? if yes - the
+  above use case is not valid.
+
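+  The quorum sizes in this example satisfy the usual overlap condition
+  (a sketch, assuming the standard quorum-intersection argument; the
+  macro usage below is only illustrative):
+
+  @code
+  /* Any read quorum intersects any write quorum iff
+   * READ_Q + WRITE_Q > REPLICAS; with 2 + 2 > 3 a single
+   * failure is tolerated, but two concurrent failures can
+   * leave a read quorum that missed the latest write. */
+  enum { REPLICAS = 3, WRITE_Q = 2, READ_Q = 2 };
+  M0_BASSERT(READ_Q + WRITE_Q > REPLICAS);
+  @endcode
+
+  <hr>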
+  @section DLD-def Definitions
+  Mandatory.
+  The DLD shall provide definitions of the terms and concepts
+  introduced by the design, as well as the relevant terms used by the
+  specification but described elsewhere. References to the
+  M0 Glossary and the component's HLD are permitted and encouraged.
+  Agreed upon terminology should be incorporated in the glossary.
+
+  Previously defined terms:
+
+  - Storage device (sdev) A configuration object that corresponds to a
+    physical device where Motr keeps data. In this document, this term is
+    used as a synonym for "persistent participant": Motr stores its
+    persistent data on persistent participants.
+
+  - Unit Unit of data. In this document, this term is used
+    primarily to describe DIX records (DIX/CAS) and data units (IO/IOS).
+
+  New terms:
+
+  - Participant A member of a distributed transaction. Participants
+    comprise originators and storage devices. One persistent participant
+    corresponds to one sdev.
+
+  - Originator The initiator of a distributed transaction. Originators
+    have no persistent storage.
+
+  - PERSISTENT message, Pmsg, Pmsgs (plural) A message that indicates
+    that certain information (a transaction, a log record) became persistent
+    on a certain storage device (persistent participant).
+
+  - Log record has/is All-P: P messages were received about all
+    non-FAILED storage devices that are participants of this log record's
+    dtx. (FAILED sdevs will be replaced later with new sdevs which will have
+    new FIDs, and the data will be rebalanced to them.)
+
+  - Local participant -- a participant which is handled by the current
+    DTM0 domain. (One DTM0 domain may have several local participants.)
+
+  - Remote participant -- a participant which is not a local participant.
+
+  - Availability This term is used in the following cases:
+
+    Read availability of a participant: it can successfully serve READ
+    requests. A participant is available for reads in the ONLINE state only.
+
+    Write availability of a participant: it can successfully serve WRITE
+    requests. A participant is available for writes in the ONLINE state.
+
+    A Unit is READ-available if at least a read-quorum of replicas are on
+    READ-available participants, i.e. on ONLINE storage devices.
+
+    A Unit is WRITE-available if at least a write-quorum of replicas are on
+    WRITE-available participants, i.e. on ONLINE storage devices.
+
+  <hr>
+  @section DLD-req Requirements
+  Mandatory.
+  The DLD shall state the requirements that it attempts to meet.
+
+  The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+  "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+  document are to be interpreted as described in RFC 2119.
+
+  Considerations/assumptions (TODO: move these into the respective sections):
+
+  - Client has limited capacity (timeout for operation).
+    Conclusion: the sliding window is always moving on.
+    Need to support m0_op_cancel(). On the client we just drop the txn from
+    the log. For the servers we just update o.BEGIN in the window range in
+    the new txn_reqs.
+
+  - Max duration of a TRANSIENT failure - up to 1 month. Algorithms should
+    support this and work without impact (performance-wise).
+
+  - We want to be able to clean up the log in a way that this operation
+    is not limited by the time when someone entered TRANSIENT. (This
+    optimisation can be done on top of the base algorithm later. Consider
+    an additional Max-All-P' pointer which skips the records of transient
+    participants, and which allows cleaning up the log.)
+
+  - N% of performance degradation is allowed during dtm0 recovery.
+
+  Requirements:
+
+  - dtm0 MUST maximize availability and durability of the system without
+    a big impact on its performance.
+
+  - R.dtm0.maximize.availability
+
+  - R.dtm0.maximize.durability
+    dtm0 MUST restore missing unit replicas in a minimal time.
+    See Online-Recovering for the details.
+
+  - R.dtm0.little-impact.performance
+    - dtm0 MUST NOT introduce bottlenecks in the system.
+    - dtm0 MUST NOT introduce unnecessary delays.
+
+  - dtm0 MUST handle at least the following kinds of failures:
+    - replicas MAY become missing due to process crash/restart or because of
+      an unreliable network (missing packets or network partitioning).
+
+  - dtm0 MUST restore missing replicas even without process restarts.
+    No special RECOVERING state is needed, recovery is done automatically.
+
+  - dtm0 MUST NOT restore missing replicas in case of permanent failures.
+
+  - [R.dtm0.client.limited-ram] dtm0 memory usage MUST be limited.
+    - Clients have the window of pending requests which limits the memory
+      consumption of clients, as well as of the servers.
+    - Failures (like transient participants etc.) should not cause an increase
+      of memory usage on the clients. Clients MAY participate in the recovery
+      process, when their memory permits.
+
+  - dtm0 SHOULD NOT support transaction dependencies.
+
+  - dtm0 consistency model MUST be configurable??
+    Note: this sounds like a hard-to-implement requirement atm.
+    Maybe for the future.
+
+  - dtm0 performance/durability tradeoff MUST be configurable??
+    Note: this might be hard to implement, but try to design with this
+    thought in mind.
+    Example: configure the priority of online recovery.
+
+  - configuration options:
+    1. Read quorum, write quorum.
+    2. Number of replicas (N+K).
+
+  - dtm0 MUST handle out-of-disk-space and out-of-memory conditions.
+    The user should get an error; the system should not crash or hang
+    leaving the system in an inconsistent state.
+
+  - dtm0 MUST minimize the use of storage for the transactions that are
+    replicated on all non-failed participants. (Justification of the Pruner.)
+    Comment: a log record eventually might serve as a FOL (File Operations
+    Log) record; in this case pruning of the log will depend on the
+    FOL liveness requirements.
+
+  <hr>
+ @section DLD-depends Dependencies + Mandatory. Identify other components on which this specification + depends. + + + - dtm0 relies on Motr HA (Hare) to provide state of Motr configuration + objects in the cluster. + + - DIX relies on dtm0 to restore missing units replicas. + +
+  @section DLD-highlights-window Design Highlights: Window
+  Each client has a window of pending transactions which have not
+  reached the stable state nor have been cancelled yet.
+
+  We send this window to each participant with every new request.
+  The window is the range [o.BEGIN, o.END) where:
+  - o.END is max(o.originator)+1 over all transactions ever created
+    on the originator (since the process start), or 1 if there were none.
+  - o.BEGIN is min(o.originator) over all non-finalised transactions, or
+    o.END if there are no non-finalised transactions.
+
+  The window is used as a logical timeout to start sending REDO messages
+  for this originator: we don't need to send REDOs for transactions
+  which are still pending. To the window we also add some fixed number
+  of transactions (N-txns) during which we still give participants a
+  chance to send us Pmsgs before we start sending REDOs to them.
+
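+  A minimal sketch of computing the window on the originator, assuming a
+  hypothetical flat array of all transactions created since process start
+  (the type and function names are illustrative, not from the Motr source):
+
+  @code
+  struct orig_txn { uint64_t ot_clock; bool ot_finalised; };
+
+  /* Window [o.BEGIN, o.END) over the transactions created so far. */
+  static void window_calc(const struct orig_txn *txns, uint64_t nr,
+                          uint64_t *begin, uint64_t *end)
+  {
+          uint64_t i;
+
+          *end = 1;                    /* no transactions yet => [1, 1) */
+          for (i = 0; i < nr; i++)
+                  if (txns[i].ot_clock + 1 > *end)
+                          *end = txns[i].ot_clock + 1;
+          *begin = *end;
+          for (i = 0; i < nr; i++)
+                  if (!txns[i].ot_finalised && txns[i].ot_clock < *begin)
+                          *begin = txns[i].ot_clock;
+  }
+  @endcode
+
+  <hr>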
+ @section DLD-highlights-clocks Design Highlights: Clocks + TODO: revisit, maybe not needed. + + Originator keeps uint64_t clock for itself and for every storage device. + The clocks are initialized with zero when the originator starts. + When a new transaction is created then the originator clock and the + corresponding clocks of the storage devices are getting incremented, + and then they all are added to the transaction descriptor: + + @verbatim + XXX: o.sdev1,sdev2,..sdev6 are used solve the problem log holes. + + [o.originator] [o.sdev1] [o.sdev2] [o.sdev3] [o.sdev4] [o.sdev5] [o.sdev6] + | | | | | | | + 0 0 0 0 0 0 0 + | | | | | | | + +----------------------------------------+ | | | + T1| 1 1 1 1 | 0 0 0 + +----------------------------------------+ | | | + | | | | | | | + 1 1 1 1 0 0 0 + | | | | | | | + +----------------------------------------------------+ | | + T2| 2 2 2 1 | 0 0 + +----------------------------------------------------+ | | + | | | | | | | + 2 1 2 2 1 0 0 + @endverbatim + + The picture represents volatile counters on the originator. + + DTM0 log on a persistent participant maintains the following structure + for every originator: + + @verbatim + struct originator { + map max_all_p; + }; + @endverbatim + + In this structure we store the information passed with P message: the + most recent (max) o.originator value among the transactions received + from that originator that have become All-P. + +
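+  The counter updates from the clock diagram above could be sketched as
+  follows (illustrative names only; six storage devices as in the picture):
+
+  @code
+  enum { NR_SDEVS = 6 };
+
+  struct orig_clocks {
+          uint64_t oc_self;           /* o.originator         */
+          uint64_t oc_sdev[NR_SDEVS]; /* one counter per sdev */
+  };
+
+  /* On creation of a transaction with the given participants: tick the
+   * originator clock and the clocks of the participating sdevs, and
+   * record them all in the transaction descriptor. */
+  static void txn_clocks_tick(struct orig_clocks *c, const int *sdevs,
+                              int sdevs_nr, uint64_t *descr)
+  {
+          int i;
+
+          descr[0] = ++c->oc_self;
+          for (i = 0; i < sdevs_nr; i++)
+                  descr[i + 1] = ++c->oc_sdev[sdevs[i]];
+  }
+  @endcode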
+  @section DLD-highlights-holes Design Highlights: Holes
+
+  The idea of holes is to make sure that we don't prune the log while some
+  transaction may still be missing at some participant.
+
+  There are two types of holes: temporary and permanent.
+  - Temporary hole -- the transaction has no REDO yet, and a transaction
+    with such an sdev clock will eventually come either from the originator
+    or from another sdev;
+  - Permanent hole -- a transaction with such an sdev clock value will never
+    come; or, if it comes (for some weird reason) to some participant, it
+    should be discarded as a stale one.
+
+  Temporary holes are possible on the right side of the Max-All-P pointer,
+  that's why the pruner can prune only the records before Max-All-P.
+
+  <hr>
+ @section DLD-highlights-clocks Design Highlights: Online-Recovering + + Online-Recovering interval is where we send REDOs to others from which + we don't have Pmsgs. + + Idea: we have window [o.BEGIN, o.END) on the client. We have "recent" + transactions from the client such as: + + @verbatim + w1 [BEGIN=1 END=11) + w2 [BEGIN=2 END=12) + ... + w13 [BEGIN=13 END=14) + ... + w14 [BEGIN=14 END=34) + + ------------------------------------------------------------------> + o.originator + @endverbatim + + @verbatim + + [All-P / almost-All-P] [Online-Recovering] [N-txns] [current-window] + ------------------------------------------------------------------> + + All-P -- transactions that do have Pmsgs from all participants except + FAILED ones (if any). In other words, FAILED participants do not affect + All-P (because they are failed permanently and we don't expect anything + from them). + + Almost-All-P -- transactions that do not have P messages from TRANSIENT + participants but all other participants have sent Pmsgs or are FAILED. + + N-txns -- a group of transactions for which we are not going to send out + REDOs because it is possible that the requests are still somewhere in the + incoming queue on the server side, so that they could be executed (or any + similiar situation). + + current-window -- [o.BEGIN, o.END). + + @endverbatim + + min-nall-p is the oldest transaction from the originator for which we + still wait for pmsg(s). + + (min-nall-p) is added to every Pmsg: + + @verbatim + struct pmsg { + fid source; + fid destination; + + u64(o.originator) timestamp; + fid originator; + + u64(o.originator) min-nall-p; + }; + @endverbatim + +
+ @section DLD-highlights-clocks Design Highlights: Basic-Recovery-Algorithm + + Upon receival of cas request or redo msg we add the record to the btree log, + if it's not already there. (Otherwise, we just close the local transaction.) + The key is the originator_fid + timestamp, so all the records will be + naturally sorted in the btree by the time. + + As the new records are coming, the older ones will move leftwise. When the + record moves to the 3rd interval (Online-Recovering, see the timeline + diagram below), we can start sending redo msgs to other paricipants, from + which we didn't receive pmsg yet. + + When the record got all pmsgs from all participants, it becomes all-p and + can move to the 4th interval (All-P), where it can be eventually deleted by + the pruner. + + Let's start with the case where we have no TRANSIENT failures of storage + devices in the pool. In this case, the diagram would look like the following + picture: + + @verbatim + (IV) (III) (II) (I) + [ All-P ] [Online-Recovering] [N-txns] [current-window] + ------------------------------------------------------------------> + [ <- may have temporary and permanent holes ---> ] (2) + [ may have + permanent but + not temporary + holes ] (1) + @endverbatim + + We need volatile structure to keep track of the range (2). + We make a tree (for example, rb) such as: + @verbatim + key = o.originator; + value = { + ptr to log record in BE seg, + fid participant_array[] (participant_index -> fids), + bool pmsg_array[] (participant_index -> has Pmsg or not), + bool is_locally_persistent (locally persistent or not), + be_op *executed; + be_op *persistent; + }; + @endverbatim + + The tree is owned by the log. + + B -> A: min-nall-p; + txA, txB; + txA \in A; txB \in B; + txA == min-nall-p(on A, contains B); // == next(max-all-p(A, B)) + txB == min-nall-p(on B, contains A); + + txB.clock < txA.clock; // send A -> B: Online-Recovering + + \E A.tx: B.min-nall-p(A) < A.min-nall-p(B) + +
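+  The volatile tracking structure sketched above could look like this in C
+  (field and type names are illustrative, not the actual DTM0 code):
+
+  @code
+  struct dtx_track {
+          uint64_t         dt_clock;        /* key: o.originator          */
+          void            *dt_rec;          /* log record in the BE seg   */
+          struct m0_fid   *dt_participants; /* participant_index -> fid   */
+          bool            *dt_has_pmsg;     /* participant_index -> Pmsg? */
+          bool             dt_local_p;      /* persistent locally?        */
+          struct m0_be_op  dt_executed;
+          struct m0_be_op  dt_persistent;
+  };
+  @endcode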
+ @section DLD-highlights-clocks Design Highlights: With T + + struct log { ... preserved_max_all_p[originator -> o.originator]; }; + +
+   @section DLD-highlights-clocks Design Highlights: HA
+
+   HA must be able to detect slow servers and deal with them one way or
+   another. DTM will send HA the information that helps to detect slow
+   servers. This allows limiting the size of the REDO-with-RECOVERING range
+   (to satisfy R.dtm0.limited-ram).
+
+   @section DLD-highlights-clocks Design Highlights: Single clock
+
+   Client has its own logical clock (o.originator).
+   Transactions in all server lists (per originator, per sdev) are ordered.
+   Transactions in the originator's list are ordered by o.originator.
+   For each originator the participant keeps per storage device arrays.
+   The first array points to Max-All-P, the second array points to the lowest
+   Non-All-P. With every Pmsg (src sdev, dst sdev, originator fid, dtxid),
+   the min Non-All-P for ...
+
+   Let's assume we have one originator. This will help us to define the
+   algorithm for one single originator, and then we can just extend it to
+   the multiple originators case (because originators are independent).
+   Let's start with the algorithm that figures out holes in the
+   REDO-without-recovery case. For each local storage device we keep an
+   array with elements for each storage device, and the elements of the
+   array define the min-nall-p for a transaction on this originator which
+   includes both storage devices (local and remote).
+   In each Pmsg we send this min-nall-p value for src (local) and dst
+   (remote). On the receiving end we compare this min-nall-p with the value
+   we have for the same pair of storage devices. If the remote min-nall-p is
+   greater than the local one, it means we should send REDO to that
+   participant.
+   Each Pmsg also updates max-all-p and min-nall-p.
+   all-p can be moved forward if remote min-nall-p > local all-p?
+
+   TODO: describe persistent iterators for TRANSIENT failures
+   (Almost-All-P).
+
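+   A minimal sketch of the comparison on the receiving side (hypothetical
+   names; one entry per (local sdev, remote sdev) pair, as described above):
+
+   @verbatim
+   #include <stdint.h>
+   #include <stdbool.h>
+
+   struct pair_state {
+           uint64_t local_min_nall_p;  // our view for this pair
+           uint64_t remote_min_nall_p; // last value received in a Pmsg
+   };
+
+   // Update the stored remote value and tell whether we should start
+   // sending REDOs to the remote participant for this pair.
+   static bool pair_on_pmsg(struct pair_state *ps, uint64_t msg_min_nall_p)
+   {
+           if (msg_min_nall_p > ps->remote_min_nall_p)
+                   ps->remote_min_nall_p = msg_min_nall_p;
+           return ps->remote_min_nall_p > ps->local_min_nall_p;
+   }
+   @endverbatim
+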
+ @section DLD-highlights-clocks Design Highlights: ADDB counters + TODO. + +
+   @section DLD-highlights-holes Design Highlights: RECOVERING is not needed
+
+   We assume that REDO-with-RECOVERING may happen at any time.
+   Moreover, any operation can be canceled at any moment, including the time
+   when write-quorum has not been reached.
+   If a record was written with write-quorum then a subsequent read-quorum
+   read will return the recent data. In any other case, consistency is not
+   guaranteed.
+   Therefore, there is no need to have a separate RECOVERING state.
+
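+   The read-your-writes guarantee above rests on quorum intersection (a
+   standard fact, illustrated here rather than taken from the Motr code):
+
+   @verbatim
+   #include <stdbool.h>
+
+   // Quorums of sizes w and r out of n replicas intersect iff w + r > n,
+   // so a read-quorum read always sees at least one replica of the last
+   // write-quorum write. Example: n = 3, w = 2, r = 2.
+   static bool quorums_intersect(unsigned n, unsigned w, unsigned r)
+   {
+           return w + r > n;
+   }
+   @endverbatim
+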
+   @section DLD-highlights-holes Design Highlights: Simple recovery
+
+   The goal: move Max-All-P as far as possible.
+   For that, we have to solve two tasks:
+   - fill in the missing remote temporary holes;
+   - figure out if there are local temporary holes right after the current
+     Max-All-P.
+
+   First, let's take another look at the intervals:
+
+   @verbatim
+       (IV)             (III)            (II)       (I)
+   [ Seq-All-P ] [Online-Recovering] [N-txns] [current-window]
+   x------------x---------------------x-------x---------------> (originator's clock)
+   ^            ^                     ^
+   |            | Max-All-P           |
+   |            |                     | Last-chance-before-recovering-starts
+   | Last non-pruned dtx
+
+   Intervals:
+    IV: [last non-pruned dtx, Max-All-P]
+   III: (Max-All-P, Non-Rwr]
+   @endverbatim
+
+   We use Min-Non-All-P to determine if the remote side requires
+   Online-Recovering.
+   For every remote storage device we keep a volatile Min-Non-All-P:
+   - initialized with zero;
+   - updated when a Pmsg is received.
+
+   Note, transactions on the originator start with 1.
+   Note, Min-Non-All-P is sent as a part of Pmsg.
+
+   Whenever the Online-Recovering interval becomes non-empty we start sending
+   REDOs to the corresponding participants. By definition, the interval may
+   contain non-All-P transactions, temporary holes and All-P records.
+
+   There are 3 possible cases for the log record after Max-All-P:
+
+   - the next record is Non-All-P (1);
+   - the next record is a temporary hole (2);
+   - the next record is All-P (3).
+
+   In the first case we just send REDO msg(s) to the participant(s) from
+   which we did not get Pmsgs yet. The participants will send us Pmsgs, which
+   will eventually lead us to the third case.
+
+   To make progress in the second and third cases we use the remote
+   min-nall-p values: we check if the next log record's clock value is less
+   than the minimum of the set of remote min-nall-p values from all
+   participants; if so, there are no temporary holes between the current
+   Max-All-P and the next record (which is an All-P record itself, the 3rd
+   case), so we move Max-All-P to it.
+
+   @verbatim
+   Let's say min-min-nall-p = min(p.min-nall-p) for all participants p in
+   the cluster.
+   Then, we have the following condition and action:
+   if next(Max-All-P) < min-min-nall-p and next(Max-All-P) is All-P then
+           we move Max-All-P = next(Max-All-P)
+   endif.
+   @endverbatim
+
+   Known problem: if an originator did not have enough transactions to get
+   the pmsgs from all participants in the cluster before it crashed, got
+   disconnected, or just stopped sending new transactions for a while for
+   some reason, and we cannot move Max-All-P because of that, we don't clean
+   up its records from the log for now. TODO.
+
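+   A sketch of the advancement rule from the pseudocode above (hypothetical
+   names; the record list is as in the earlier sketches, min_nall_p[] holds
+   the last value received from each participant, 0 meaning "none yet"):
+
+   @verbatim
+   #include <stdint.h>
+   #include <stdbool.h>
+   #include <stddef.h>
+
+   struct rec {
+           uint64_t    clock;
+           bool        all_p;
+           struct rec *next;
+   };
+
+   // min-min-nall-p = min over all participants p of p.min-nall-p.
+   static uint64_t min_min_nall_p(const uint64_t *min_nall_p, unsigned nr)
+   {
+           uint64_t m = UINT64_MAX;
+           unsigned i;
+
+           for (i = 0; i < nr; i++)
+                   if (min_nall_p[i] < m)
+                           m = min_nall_p[i];
+           return m;
+   }
+
+   // Move Max-All-P right while the next record is known to be All-P and
+   // no temporary hole can hide before it (clock below min-min-nall-p).
+   static struct rec *advance_max_all_p(struct rec *max_all_p, uint64_t mmnp)
+   {
+           while (max_all_p->next != NULL &&
+                  max_all_p->next->all_p &&
+                  max_all_p->next->clock < mmnp)
+                   max_all_p = max_all_p->next;
+           return max_all_p;
+   }
+   @endverbatim
+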
+   @section DLD-highlights-holes Design Highlights: Improvements for simple
+   recovery
+
+   Now, let's take a look at the intervals when one participant is in
+   TRANSIENT. In this case, the All-P interval will not be able to advance
+   because we are expecting that all participants will be able to execute
+   REDOs and send their min-nall-p. This will lead us to a situation where
+   the DTM0 log cannot be pruned. There are ways to avoid this.
+
+   Here is an example of the algorithm using an additional
+   Virtual-Max-All-P pointer which skips transactions that have all-p except
+   for the TRANSIENT participant(s).
+
+   @verbatim
+   Virtual-Max-All-P: iterator, created when some sdev(s) go TRANSIENT.
+   Initial state: Virtual-Max-All-P == Max-All-P.
+
+   v-min-min-nall-p = min(p.min-nall-p) for all participants p in the cluster
+   except the ones that are in TRANSIENT.
+
+   Then, v-min-min-nall-p is used instead of min-min-nall-p in the basic
+   algorithm.
+
+   [Max-All-P, Virtual-Max-All-P]
+   Max-All-P == Virtual-Max-All-P
+   @endverbatim
+
+   The basic algorithm requires O(N^2) memory for per-sdev data (lists,
+   counters, etc.). To alleviate the problem, we may keep volatile data on
+   the client side. In this case, the client will tell storage devices about
+   min-nall-p.
+   TODO
+
+   The basic algorithm requires O(N^2) P messages (per sdev). To alleviate
+   the problem, the client may redistribute Pmsgs: a server may send a Pmsg
+   to the client as a part of an EXECUTED message (for example, inside a CAS
+   reply). The client will send the other servers' Pmsgs as a part of an
+   EXECUTE message (for example, a CAS request).
+   TODO
+
+   The basic algorithm requires O(N^2) REDO messages (per sdev). To alleviate
+   the problem, we send REDOs from the first non-failed and non-transient
+   participant in the participant list of the transaction.
+   TODO
+
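+   To make the Virtual-Max-All-P variant concrete, a sketch of the modified
+   minimum that skips TRANSIENT participants (hypothetical names; the
+   participant states are reduced to a boolean for brevity):
+
+   @verbatim
+   #include <stdint.h>
+   #include <stdbool.h>
+
+   // v-min-min-nall-p: like min-min-nall-p, but participants currently in
+   // TRANSIENT are excluded, so a temporarily unavailable sdev does not
+   // block Virtual-Max-All-P (and, therefore, log pruning).
+   static uint64_t v_min_min_nall_p(const uint64_t *min_nall_p,
+                                    const bool *is_transient, unsigned nr)
+   {
+           uint64_t m = UINT64_MAX;
+           unsigned i;
+
+           for (i = 0; i < nr; i++)
+                   if (!is_transient[i] && min_nall_p[i] < m)
+                           m = min_nall_p[i];
+           return m;
+   }
+   @endverbatim
+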
+   @section DLD-highlights Design Highlights
+   Mandatory. This section briefly summarizes the key design
+   decisions that are important for understanding the functional and
+   logical specifications, and enumerates topics that need special
+   attention.
+
+   [ head ... tail] [ to_be_assigned/current ]
+
+   Sliding window: all pending transactions on the client:
+
+   - min timestamp (the "head" of the list, or "current" if the list is
+     empty);
+   - "current" timestamp;
+
+   Invariant: for the given [min, current) interval, its boundaries are never
+   decreasing (i.e. they never move leftward).
+
+   The sliding window is updated whenever the client is not idle.
+   When the client is idle we rely on local detection: if there is no io
+   from that client while there is io from others, then the client is idle.
+   The sliding window allows us to prune records at the right time (see the
+   sketch at the end of this section).
+   TODO: figure out an algorithm to detect the idle client, to be able to
+   shrink the window. (Otherwise, the recovery will get stuck.)
+
+   - dtm0 uses a persistent dtm0 log on participants other than the
+     originator, and it uses a volatile dtm0 log on the originator.
+
+   - persistent dtm0 log uses BE as storage for dtm0 log records.
+
+   - dtm0 uses m0 network for communication.
+
+   - dtm0 saves information in the dtm0 log that may be needed for dtm0
+     recovery until the transaction is replicated on all non-failed
+     participants.
+
+   - to restore a missing replica, a dtm0 participant sends the dtm0 log
+     record to the participant where the replica is missing.
+
+   - dtm0 propagates back pressure (memory, IO, etc) across the system, thus
+     helping us to avoid overload due to "too many operations in progress".
+
+   [Categories of fids]
+   Originator is a service.
+   Storage device is a persistent participant.
+   Participant is a service or a storage device.
+
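+   A sketch of the client-side window bookkeeping and its monotonicity
+   invariant (hypothetical names):
+
+   @verbatim
+   #include <stdint.h>
+   #include <assert.h>
+
+   struct window {
+           uint64_t min;     // head of the pending list, or current if empty
+           uint64_t current; // next timestamp to be assigned
+   };
+
+   // Both boundaries may only move rightward.
+   static void window_update(struct window *w, uint64_t min, uint64_t current)
+   {
+           assert(min >= w->min && current >= w->current && min <= current);
+           w->min     = min;
+           w->current = current;
+   }
+   @endverbatim
+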
+   @section DLD-lspec Logical Specification
+   Mandatory. This section describes the internal design of the component,
+   explaining how the functional specification is met. Sub-components and
+   diagrams of their interaction should go into this section. The section has
+   mandatory subsections created using the Doxygen @@subsection command. The
+   designer should feel free to use additional sub-sectioning if needed,
+   though if there is significant additional sub-sectioning, provide a table
+   of contents here.
+
+   - @ref DLD-lspec-comps
+   - @ref DLD-lspec-sub
+     - @ref DLD-lspec-ds-log
+     - @ref DLD-lspec-sub1
+       - @ref DLDDFSInternal
+   - @ref DLD-lspec-state
+   - @ref DLD-lspec-thread
+   - @ref DLD-lspec-numa
+
+
+   @subsection DLD-lspec-comps Component Overview
+   Mandatory.
+   This section describes the internal logical decomposition.
+   A diagram of the interaction between internal components and
+   between external consumers and the internal components is useful.
+
+   - The components diagram of DTM0:
+     ./doc/Images/DTM0R Components and message flow.svg
+
+   - dtm0 consists of dtm0 log, pruner, recovery machine, persistent machine,
+     net, HA, dtx0 modules.
+
+   @subsection DLD-lspec-sub Subcomponent design
+   Such sections briefly describe the purpose and design of each
+   sub-component. Feel free to add multiple such sections, and any additional
+   sub-sectioning within.
+
+   - DTM0 log:
+   DTM0 log uses BE to store data.
+   It uses a BTree; keys are txids, values are records.
+   BTrees are "threaded" trees: each record is linked into several lists in
+   addition to being in the tree.
+   DTM0 log provides an API that allows the user to add or remove log
+   records. Also, there are log iterators: an API that allows the user to
+   traverse over specific kinds of records (for example, "all-p" records).
+   @verbatim
+
+   struct m0_dtx0_id {
+           struct m0_fid dti_originator;
+           uint64_t      dti_timestamp;
+   } M0_XCA_RECORD M0_XCA_DOMAIN(rpc|be);
+
+   struct m0_dtx0_participants {
+           uint64_t       dtpa_participants_nr;
+           struct m0_fid *dtpa_participants;
+   } M0_XCA_SEQUENCE M0_XCA_DOMAIN(rpc|be);
+
+   struct m0_dtx0_descriptor {
+           struct m0_dtx0_id           dtd_id;
+           struct m0_dtx0_participants dtd_participants;
+   } M0_XCA_RECORD M0_XCA_DOMAIN(rpc|be);
+
+   struct m0_dtx0_payload {
+           uint32_t       dtp_type M0_XCA_FENUM(m0_dtx0_payload_type);
+           struct m0_bufs dtp_data;
+   } M0_XCA_RECORD M0_XCA_DOMAIN(rpc|be);
+
+   struct m0_dtm0_redo {
+           struct m0_dtx0_descriptor dtr_descriptor;
+           struct m0_dtx0_payload    dtr_payload;
+   };
+
+   struct pmsg {
+           dtx0_id;
+           fid source;      // sdev_fid
+           fid destination; // sdev|service fid
+           u64 min_nall_p;
+   };
+
+   struct redo_list_link {
+           struct log_record      *rll_rec;
+           struct m0_be_list_link  rll_link;
+   } M0_XCA_RECORD M0_XCA_DOMAIN(rpc|be);
+
+   struct redo_list_links {
+           uint32_t               rll_nr;
+           struct redo_list_link *rll_links;
+   } M0_XCA_SEQUENCE M0_XCA_DOMAIN(rpc|be);
+
+   // Volatile list populated by the log for pmachine to send pmsgs.
+   // XXX: consider using be_queue, if it's easier
+   struct persistent_records {
+           // struct pmsg pr_pmsgs[participants_n]; // another variant to consider?
+           struct m0_dtx0_descriptor pr_rec;
+           fid                       pr_source_sdev;
+           u64                       pr_min_nall_p;
+           m0_list_link              pr_link;
+   };
+
+   struct redo_ptrs {
+           struct log_record *prev;
+           struct log_record *next;
+   };
+
+   struct log_record {
+           struct m0_dtm0_redo redo;
+           //be_list_link allp; ?? for the pruner ready to cleanup
+           struct redo_list_links redo_links;
+           //struct redo_ptrs redo_links[MAX N+K+S - 1];
+
+           // Is the transaction still volatile or already persistent?
+           // Note: this field itself is not captured in BE.
+           bool is_volatile; // 1 - yes (i.e. not persistent), 0 - no (persistent)
+   } M0_XCA_RECORD M0_XCA_DOMAIN(rpc|be);
+
+   struct dtm0_log_originator {
+           fid originator;
+
+           // window rcvd from the originator
+           u64 o.BEGIN; // max value received from originator
+           u64 o.END;   // max value received from originator
+
+           u64 max_allp; // value is the timestamp of max_allp txn
+
+           u64 local_min_nall_p; // volatile
+
+           // Volatile hashmap to store min_nall_p values from pmsgs
+           u64 min_nall_p[sdev];
+   };
+
+   struct dtm0_log_sdev {
+           be_list redo; // see rll_link
+   };
+
+   struct dtm0_log {
+           btree records(key=dtxid, value=log_record);
+           btree originators(key=o_fid, value=dtm0_log_originator);
+           btree redo_lists(key=sdev, value=dtm0_log_sdev);
+           m0_tlist persistent_records; // see pr_link
+   };
+
+   @endverbatim
+
+   @verbatim
+   Per originator lists are ordered by originator's clocks.
+   Per sdev lists are not ordered.
+   All-p list is not ordered.
+   Record is moved to All-p only ...
+
+
+
+   @endverbatim
+
+   - DTM0 net:
+   DTM0 net uses Motr RPC.
+   It provides a queue-like API: a message could be posted into the network,
+   and the user may subscribe to a particular kind of incoming messages.
+   It has no persistent state.
+   The API provides asynchronous completion: the user may wait until a
+   message is acknowledged by the remote side (an RPC reply is received).
+   It cancels all outgoing/incoming messages when HA tells us that a
+   participant goes to FAILED/TRANSIENT state.
+   Optimizations:
+   - Do not use the network if the destination is in the same Motr process.
+
+   - DTX0:
+   It is the external API of dtm0 for dtm0 users.
+   Duplicates (CAS requests, REDO messages, etc.) are checked at the DTM0 log
+   level.
+
+   - Persistent machine:
+   It solves two tasks. It sends out a Pmsg when a local transaction becomes
+   persistent (1).
+   It updates the log when a Pmsg is received from a remote participant (2).
+   Persistent machine contains a set of local participants.
+
+   However, we want to minimize the latency of a user operation. In Pmach, we
+   can prioritize sending P messages to clients over sending P messages to
+   other participants. Clients are interested in Pmsgs only for in-progress
+   transactions. Because of that, we can create an in-memory queue for each
+   client which will be used as the source of Pmsgs to the client. If we keep
+   this queue sorted and remove Pmsgs for transactions with T < T.client.min,
+   then this queue will not grow much more than the total amount of all
+   in-progress transactions for the client, which will satisfy
+   R.dtm0.limited-ram.
+   The queue does not have to be persistent, so we keep it in RAM, thus
+   satisfying R.dtm0.maximize-performance.
+
+   Alternatives:
+   1. In-memory queue (for outgoing Pmsgs). Cons: the queue is not bounded;
+   it can grow quickly if the remote is slow. It is not acceptable (see
+   R.dtm0.limited-ram). XXX: if we use one-way RPC msgs, this argument is
+   not relevant.
+
+   Pmach optimizations:
+   For outgoing: coalescing is done at the net level; wait until all local
+   transactions are persistent before sending a Pmsg about local
+   participants.
+   XXX: persistent on all local participants which are ONLINE, RECOVERING,
+   during rebalance/direct rebalance.
+   For incoming:
+   Wait until Pmsgs about a dtx are received from all non-failed (XXX:
+   ONLINE, etc. like before) participants before persisting those Pmsgs.
+   This will improve BE seg locality of reference.
+
+   - Recovery machine:
+   It solves two tasks.
It sends out REDO messages (1).
+   It applies incoming REDO messages sent from other recovery machines (2).
+   REDO messages are read out from the log directly. They are applied through
+   a callback (for example, posted as CAS FOMs).
+
+   For each local participant, we iterate over the REDO-list, send out REDOs,
+   thus recovering the corresponding remote participant. The recovery machine
+   sends the REDO to the remote participant. Note: REDO implies that the txn
+   was persistent on the sender participant.
+
+   Note, the recovery machine may need to recover a local storage device
+   (intra-process recovery). It is done in the same way as with remote
+   storage devices, except that DTM0 net will not be sending messages over
+   the network; instead, they will be looped back.
+
+   When a REDO message is received, the recovery machine calls the
+   corresponding callback that will apply the message. For example, the
+   callback submits a CAS FOM, and then the recovery machine awaits until the
+   data lands on disk. Then, the machine sends a reply. The remote recovery
+   machine receives the reply and then sends another REDO. This allows the
+   back pressure to propagate from the recovering participant to the remote
+   one.
+   XXX: with one-way RPC msgs no replies are needed, a pmsg will be sent
+   instead when the redo becomes persistent. The sender should not try to
+   resend the redo until some timeout expires, like 10 secs or more.
+
+   Duplicates are not checked by the recovery machine. Instead, DTX0 and
+   DTM0 log do that.
+
+   Aside from recovering from TRANSIENT failures, the recovery machine
+   reacts to the FAILED state: in the case of an originator it causes
+   "client eviction" (reduces the current window to zero).
+   In case of a permanent failure of a storage device, the machine does
+   nothing -- Motr repair/rebalance/direct-rebalance will take care of such
+   a device when the device is replaced.
+
+   Optimizations:
+   - send one REDO message for multiple participants that exist in the same
+     process.
+
+   - Pruner:
+   It removes records when their respective transactions become persistent
+   on all non-failed participants.
+   It removes records of FAILED participants (eviction). After a storage
+   device goes to FAILED state, the pruner assumes that the records which
+   have this storage device as a participant no longer count it as a
+   participant with a missing P message. If there are no other missing
+   P messages, then the pruner assumes that such a log record has All-P.
+   The pruner is interested only in log records that are in the log and that
+   have All-P.
+
+   - HA (dtm0 ha):
+   It provides an interface to Motr HA tailored for dtm0 needs.
+   What is needed from HA:
+   - Pmach wants to know states and transitions of remote participants.
+   - Remach: states and transitions of all participants.
+   - Net: states and transitions of all participants.
+   - Pruner: TRANSIENT and FAILED states.
+
+   A history of ha events means that we should receive the same events again
+   in case of a crash and restart. To implement such semantics we need to
+   inform ha when we have finished consuming and handling an ha event.
+
+   For the basic version of the algorithm, the history of ha events does NOT
+   matter.
+
+   [subscription to transitions]
+   DTM0 HA allows its user to subscribe to storage device state or service
+   state updates.
+   Note: to avoid missing ha states while you subscribe, always check the
+   ha state right after subscription.
+
+   [persistent ha history]
+   DTM0 HA uses BE to keep the persistent history of state transitions of
+   participants.
The history is garbage-collected by DTM0 HA itself:
+   for example, FAILED participants are removed when eviction is complete.
+   The other components may use the history.
+
+   - domain:
+   DTM0 domain is a container for DTM0 log, pruner, recovery machine,
+   persistent machine, network. It serves as an entry point for any other
+   component that wants to interact with DTM0. For example, distributed
+   transactions are created within the scope of DTM0 domain.
+   TODO: remove this text from domain.h, or from here.
+   To initialize DTM0, the user has to initialize DTM0 domain, which will
+   initialize all internal DTM0 subsystems.
+   In a normal m0d process there will be one DTM0 domain, but in the UT
+   there may be more than one DTM0 domain per Motr process.
+
+
+   @subsubsection DLD-lspec-ds-log Subcomponent Data Structures: DTM0 log
+   This section briefly describes the internal data structures that are
+   significant to the design of the sub-component. These should not be a part
+   of the Functional Specification.
+
+   XXX
+   A DTM0 log record is linked to one BTree and many BE lists.
+   DTM0 log BTree: the key is a txid, the value is a pointer to a log record.
+   BE lists link records for participants' redo_lists.
+   DTM0 log contains the BTree and a data structure that holds the
+   heads of BE lists and related information.
+
+   @section m0_dtm0_log interface for recovery machine
+   TODO: remove this text either from here or from log.h
+
+   Note: the log iter here does not care about holes in the log.
+
+   - m0_dtm0_log_iter_init() - initializes the log record iterator for a
+     sdev participant. It iterates over all records that were in the log
+     during the last local process restart or during the last remote process
+     restart for the process that handles that sdev.
+   - m0_dtm0_log_iter_next() - gives the next log record for the sdev
+     participant.
+   - m0_dtm0_log_iter_fini() - finalises the iterator. It MUST be done for
+     every call of m0_dtm0_log_iter_init().
+
+   @section interface used by pmach
+
+   - m0_dtm0_log_p_get_local() - returns the next P message for the local
+     transactions that become persistent (logged).
+     Returns M0_FID0 during the m0_dtm0_log_stop() call. After M0_FID0 is
+     returned, new calls to the log MUST NOT be made.
+
+     *pmsg
+     bool *successful - true if got something
+
+     m0_dtm0_log_p_get_local(*op, *pmsg, *successful)
+
+   - m0_dtm0_log_p_put() - records that a P message was received for the
+     sdev participant.
+
+   @section interface for pruner
+
+   - m0_dtm0_log_prune_get() - returns the dtx0 id for the dtx for which all
+     participants (except the originator) reported P for the dtx0. Also
+     returns all dtx0 which were cancelled.
+   - m0_dtm0_log_prune() - removes the REDO message about the dtx0 from the
+     log.
+
+   dtx0 interface, client & server
+
+   - bool m0_dtm0_log_redo_add_intent() - function to check if the
+     transaction has to be applied or not (if it's a duplicate), and reserves
+     a slot in the log for that record (in case it has to be applied). Note:
+     this is just an optimisation to avoid creating additional BE
+     transactions just to check for the duplicates.
+
+   - m0_dtm0_log_redo_add() - adds a REDO message into the log.
+
+   @section dtx0 interface, client only
+
+   - m0_dtm0_log_redo_p_wait(op) - returns the number of P messages for the
+     dtx, and the user can wait (on the op) for a bigger number of P msgs or
+     until m0_dtm0_log_redo_cancel() is called. TODO: revise, because the
+     client has no foms.
+   - m0_dtm0_log_redo_cancel() - notification that the client doesn't need
+     the dtx anymore. Before the function returns the op.
Note: can be done later.
+   - m0_dtm0_log_redo_end() - notifies dtx0 that the operation the dtx0 is a
+     part of is complete. This function MUST be called for every
+     m0_dtm0_log_redo_add(). It's called when we've got enough pmsgs and
+     don't need to keep this txn in the queue anymore.
+
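+   To illustrate the intended call flow of the dtx0 interface above, here is
+   a hedged sketch with simplified stand-ins, not the real prototypes (the
+   actual functions take BE tx/op and credit arguments, as described above):
+
+   @verbatim
+   #include <stdbool.h>
+   #include <stdint.h>
+
+   // Stand-ins for m0_dtm0_log_redo_add_intent()/m0_dtm0_log_redo_add().
+   bool log_redo_add_intent(uint64_t id); // false => duplicate
+   void log_redo_add(uint64_t id);        // adds the REDO record
+
+   // Rough flow for one incoming request: the duplicate check comes first,
+   // so no BE transaction is opened for requests we have already seen.
+   static void handle_request(uint64_t id)
+   {
+           if (!log_redo_add_intent(id))
+                   return;        // duplicate: reply without a new BE tx
+           // ... open BE tx, apply the operation ...
+           log_redo_add(id);
+           // ... close BE tx; Pmsgs are sent once it becomes persistent ...
+   }
+   @endverbatim
+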
+   @section DLD-impl-plan-components Implementation Plan: Components
+
+   On the client side:
+   - dtx0: dtm0 api to others, init/fini tx, cancel tx, STABLE callback;
+   - log: add req/redo, handle cancel tx, handle pmsg, ...;
+   - pmach: recv(net) and apply pmsgs to the log;
+   - pruner: cleans up the log to keep memory footprint in the limits
+     (removes STABLE and canceled transactions after a delay??);
+   - net: simple send and recv api (establishing sessions/connections
+     automatically);
+   - ha: subscription to new states (exactly once semantics?? can we
+     lose an ha-event?? we need to store events in BE to not miss them
+     after crash-restart);
+   - remach: sends REDOs;
+
+   On the server side:
+   - dtx0: dtm0 api to others, init/fini tx with be, persistent callback;
+   - log: add req/redo, handle pmsgs, signals about local txns getting
+     persistent...;
+   - pmach: set of FOMs that recv(net) and add_p(log) pmsgs, also they
+     are awaiting on the log to signal and send(net) pmsgs;
+   - pruner: awaits on log for new Max-All-P, awaits on ha for new FAILED;
+   - net: send, recv;
+   - ha: persistent log, delivered(), subscription to new states;
+   - remach: set of FOMs that await on log records for which we should send
+     REDO, and receives incoming REDOs from net, and applies incoming REDOs.
+
+   unordered:
+   - optimizations: persistent iterators (Virtual-Max-All-P), client-based
+     persistence-related coordination (to reduce the number of pmsgs in the
+     cluster), first-non-failed participant sends REDO (to reduce the number
+     of REDOs in the cluster);
+
+   V1: always working log without online recovery, only happy-path
+   - DTM0 log in btree:
+     - Static list of participants, each item is a dtx for which we have not
+       received Pmsgs;
+   - Pmach (simple shim between log and net);
+   - Pruner (removes all-p records);
+   - no recovery machine;
+   - DTX0 (simple shim between user and log);
+   - drlink-based net (drlink == old net);
+   - dix fills m0_dtx0_descriptor of m0_cas_op;
+
+   Tasks for V1:
+   - DTM0 log without REDO list + Simple Pruner (dtm/log branch).
+     (See https://github.com/Seagate/cortx-motr/pull/1503)
+   - Add static REDO lists for remote storage devices (to handle Pmsgs).
+     (See https://github.com/Seagate/cortx-motr/issues/2006)
+   - Implement m0_dtm0_log_p_get_local(op) for sending pmsgs.
+   - Implement dtm/net (dtm/net branch).
+   - Implement PMach (initial code present in dtm/refactoring-next)
+   - dix fills m0_dtx0_descriptor of m0_cas_op:
+     - move clk_src into dtm0 domain;
+   - Handle new PMSG on Client(s), options:
+     - an adapter (new -> old) (with dtm0 enabled by default);
+     - m0_cookie or just disable dtm0 by default (as it is right now);
+
+   V2:
+   - V1;
+   - add recovery after restart;
+   - adapter for client (pmsg old -> new);
+   - remach that sends REDO from lists and merges lists;
+
+   Tasks for V2:
+   - Complete tasks in V1
+   - Handle Send/Receive REDOs in servers. Simple way to determine when to
+     send REDO to the peer (physical timeout !!!)
+
+   V3:
+   - V2;
+   - Replace the old one with the new DTM0;
+
+   Features (starting with V3):
+   - Originators lists, unordered, client eviction;
+     + Add when client appears;
+     + Remove when client eviction is done;
+   - Dtx0, Pmach, pruner, volatile log for client:
+     + Pmach: receives Pmsg, marks them in the log;
+     + Log, dtx0: sends STABLE to user;
+     + User may cancel a dtx0 at any time;
+     + Canceled dtx0 is removed from the log;
+   - Logical clock on originators:
+     + increment when added to the log;
+     + send the window to the server side (just send);
+   - Remove RECOVERING state;
+   - DTM0 net based on drlink (could be needed earlier for V1,V2);
+   - DTM0 net based on queues and Motr RPC;
+   - DTM0 HA;
+   - in-memory queue for pmach;
+   - independent (per-participant) iterators for pmach;
+   - DTM0 log: tracking of real local p:
+     + a data structure (htable+list) to keep track of in-progress be tx.
+   - Redo-without-RECOVERING:
+     + Start after a delay (physical T timeout == N CAS timeouts);
+   - R-w-R: add a new criterion -- starts after N local txns or T timeout.
+   - Client eviction is replaced by R-W-R;
+   - Ordinary recovery is replaced by R-W-R;
+   - R-W-R, ordering, min-nall-p/max-all-p, pruner by max-all-p, sending of
+     min-nall-p;
+   - Optimize R-W-R: route messages through the client;
+   - Optimize: Delay pruning until V amount of space is consumed or N txns;
+   - Optimize: Send REDO from the first non-failed;
+   - Optimize: Send REDO from the client log (cache);
+   - Optimize: Pmach, batching of Pmsgs for the same tx, for the same sdev;
+   - Optimize: Partition of DTM0 log by dtx0id;
+   - Optimize: Persistent per-participant list for FOL purposes;
+
+
+   @section DLD-impl-plan Implementation Plan
+   Mandatory. Describe the steps that should be taken to implement this
+   design.
+
+   The plan should take into account:
+   - The need for early exposure of new interfaces to potential consumers.
+     Should parts of the interfaces be landed early with stub support?
+     Does consumer code have to be updated with such stubs?
+   - Modular and functional decomposition. Identify pieces that can be
+     developed (coded and unit tested) in smaller sub-tasks. Smaller tasks
+     demonstrate progress to management, as well as reduce the inspection
+     overhead. It may be necessary to plan on re-factoring pieces of code
+     to support this incremental development approach.
+   - Understand how long it will take to implement the design. If it is
+     significantly long (more than a few weeks), then task decomposition
+     becomes even more essential, because it allows for merging of changes
+     from dev into your feature branch, if necessary.
+     Task decomposition should be reflected in the GSP task plan for the
+     sprint.
+   - Determine how to maximize the overlap of inspection and ongoing
+     development.
+     The smaller the inspection task the faster it could complete.
+     Good planning can reduce programmer idle time during inspection;
+     being able to overlap development of the next coding sub-task while the
+     current one is being inspected is ideal!
+     It is useful to anticipate how you would need to organize your GIT code
+     source branches to handle this efficiently.
+     Remember that you should only present modified code for inspection,
+     and not the changes you picked up with periodic merges from another
+     branch.
+   - The software development process used by Motr provides sufficient
+     flexibility to decompose tasks, consolidate phases, etc. For example,
+     you may prefer to develop code and UT together, and present both for
+     inspection at the same time. This would require consolidation of the
+     CINSP-PREUT phase into the CINSP-POSTUT phase.
+     Involve your inspector in such decisions.
+     Document such changes in this plan, update the task spreadsheet for both
+     yourself and your inspector.
+
+   The implementation plan should be deleted from the DLD when the feature
+   is landed into dev.
+
+
+   Unsolved questions:
+   - Let's say we have a hole in the log. How to understand that the hole is
+     valid (i.e., no redo will be received) or non-valid (a missing record)?
+   - DTM0 service FIDs, user-service FIDs, storage device FIDs, process FID
+     -- what is the relation between them? see/grep [Cat of fids].
+
+   Q: Isolation?
+   A: RM/locks/etc. Executed state is required.
+      It allows us not to wait on STABLE in certain cases.
+      Counter point: but still there are some guarantees that dtm0 must
+      provide.
+   A: Nikita will provide an answer to the question about READ availability.
+
+   Q: Is cancel really needed?
+   A: no requirement for an explicit cancel().
+
+
+   NOTES
+   Holes in the log
+   ----------------
+
+   Kinds of holes:
+   - missing or canceled record
+   - missing Pmsg
+
+   Client sends:
+   - earliest non-stable
+   - latest non-stable
+
+   Server (to server) sends:
+   - latest allp
+   - earliest non-allp
+
+   Log
+   ---
+
+   originator list is ordered by its clock.
+
+   TODO: add a client cache of REDO messages that are not linked with
+   dtx'es (not needed for the client).
+
+   TODO: right now we assume one DTM0 log per pool. Later on, we need
+   a single DTM0 log.
+
+   TODO: consider almost "immutable" list links in log records (for FOL).
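+
+   For illustration, the per-link summaries listed under "Holes in the log"
+   above could be carried as (hypothetical wire structures, not the actual
+   Motr xcode types):
+
+   @verbatim
+   #include <stdint.h>
+
+   // What an originator reports about its own transactions.
+   struct client_summary {
+           uint64_t earliest_non_stable;
+           uint64_t latest_non_stable;
+   };
+
+   // What a server reports to another server (per originator).
+   struct server_summary {
+           uint64_t latest_all_p;
+           uint64_t earliest_non_all_p;
+   };
+   @endverbatim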
+
+
+   @verbatim
+       (IV)             (III)            (II)       (I)
+   [ Seq-All-P ] [Online-Recovering] [N-txns] [current-window]
+
+   (sdev1.self)
+   x------------x-------------O1-m1----------------x------x--------------->
+   (sdev1.sdev2)
+   x------------x--------m2-m3------------------x------x--------------->
+
+
+   (sdev2.self)
+   x------------x--------m4-O2-------------------x------x--------------->
+   (sdev2.sdev1)
+   x------------x----------------m5--------------------x------x------------>
+   ^            ^                                ^
+   |            | Max-All-P                      |
+   |            |                                | Last-non-r-w-r-able-dtx
+   | Last non-pruned dtx
+
+   Intervals:
+    IV: [last non-pruned dtx, Max-All-P]
+   III: (Max-All-P, Non-Rwr]
+
+   m - min-nall-p
+   @endverbatim
+
+   @verbatim
+   CAS REQ
+   (will be executed)
+         |
+        \|/
+
+       (IV)             (III)            (II)       (I)
+   [ Seq-All-P ] [Online-Recovering] [N-txns] [current-window]
+   x------------x-----------------------------x--------x--------------->
+   ^                                          ^
+   | Max-All-P                                | Last-non-r-w-r-able-dtx
+
+   CAS REQ (will be dropped)
+         |
+        \|/
+
+       (IV)
+   [ Seq-All-P ][                ] [      ] [current-window]
+   x------------x----------------------x------x--------------->
+   ^            ^                      ^
+   |            | Max-All-P            |
+   |            |                      | Last-non-r-w-r-able-dtx
+   | Last non-pruned dtx
+
+   Intervals:
+    IV: [last non-pruned dtx, Max-All-P]
+   III: (Max-All-P, Non-Rwr]
+
+   m - min-nall-p
+   @endverbatim
+
+   New conclusions:
+   - we can (must?) reject CAS requests whose txid is less than
+     min-non-all-p (local).
+   - but REDOs will be ignored when their txid is less than
+     min-min-non-all-p (global).
+
+   redo_lists[by_participants] - what are they?
+
+   A CAS request from the III-rd (Online-Recovering) interval might be
+   present in at least one of the redo_lists[by_participant], so that the
+   recovery process can send the redo msgs to the online participants from
+   which there was no pmsg yet. Therefore, every log record must have
+   redo_links[i], where i < K. (This, in turn, requires the btree log record
+   to be the pointer to the actual log record with the redo_links, so that
+   the list pointers are not broken on btree rebalance.)
+
+   Why do we need redo_lists[] exactly?
+
+   Consider the case when some participant is down for a while, like days.
+   There will be a huge amount of redo records in the log for it. But we
+   still need to be able to quickly find the redo records to be sent to the
+   online participants from which we don't have any pmsgs (for some reason) -
+   this is part of the normal online recovery work. If we don't have such
+   redo_lists[i], we will need to traverse the btree-log from the start (from
+   the oldest records) every time and skip a huge amount of records for the
+   participant which is down. redo_lists[i] allow us to ignore offline
+   participant(s) easily and work only with online ones.
+
+   How do we add redo records to the redo_lists[]?
+
+   There is no strict requirement for redo msgs to be ordered, but it's good
+   to have them ordered by txn_id in the redo_lists[]. In the initial
+   implementation we can just append the records at the tail of the lists.
+
+   Upon receiving a new cas request or redo msg, the first thing we must
+   check is whether it is in the right interval: if it's older than the
+   min-nall-p, it should be dropped. (This might be a request which got
+   stuck for a while somewhere in the network and which is stale by now.)
+
+   After inserting the record into the log-btree, we should add it to the
+   redo_lists[] for each participant the transaction belongs to (see the
+   sketch below).
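+
+   Before turning to the ordering details, here is a sketch of the insertion
+   path just described (hypothetical names; btree and list operations are
+   reduced to stand-ins):
+
+   @verbatim
+   #include <stdbool.h>
+   #include <stdint.h>
+   #include <stddef.h>
+
+   struct log_rec;
+
+   // Stand-ins for the log-btree and the per-participant REDO lists.
+   struct log_rec *log_btree_insert(uint64_t txid); // NULL => duplicate
+   void redo_list_append(uint32_t participant, struct log_rec *rec);
+
+   // Handle an incoming CAS request or REDO: drop stale ones, insert the
+   // record into the log-btree, then link it into the redo_lists[] of
+   // every participant of the transaction.
+   static bool log_add(uint64_t txid, uint64_t min_nall_p,
+                       const uint32_t *participants, uint32_t nr)
+   {
+           struct log_rec *rec;
+           uint32_t        i;
+
+           if (txid < min_nall_p)
+                   return false;  // stale request, drop it
+           rec = log_btree_insert(txid);
+           if (rec == NULL)
+                   return false;  // duplicate, already in the log
+           for (i = 0; i < nr; i++)
+                   redo_list_append(participants[i], rec);
+           return true;
+   }
+   @endverbatim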
+
+   In most cases, the records will be sorted if we just add them to the end
+   of the redo_lists[], and in very rare cases when it is not so (for
+   example, when some cas request was delayed in the network for some reason
+   so that it immediately falls into the III-rd interval) - the right place
+   can be easily found by searching from the end of the list. As it was
+   mentioned above, this optimisation can be implemented later.
+
+   How are the redo_lists[] cleaned up?
+
+   Upon receipt of a pmsg from the participant p, we find the record in the
+   log-btree by its txn_id and remove it from the corresponding
+   redo_lists[p]. If it was the last non-empty redo_list for this log record
+   (which means we've got pmsgs from all participants for it and it becomes
+   all-p), and this log record is min-nall-p - we can move the min-nall-p
+   pointer to the right until we find the next nall-p log record.
+
+   What if we get a pmsg for which there is no log record yet?
+
+   Such situations may indeed happen when, for example, one server processes
+   requests faster than the other, or due to some network delays. In any
+   case, we should record such pmsgs to avoid sending needless redo msgs
+   later, which would only aggravate the situation on busy networks and
+   systems.
+
+   One way to record such pmsgs is to create placeholder records in the log
+   with a corresponding flag in the payload structure. On the 1st such pmsg
+   arrival, we should add the placeholder record to all redo_lists[] of the
+   corresponding participants, except the one from which the pmsg arrived.
+   On subsequent arrival of pmsgs, we can just remove the placeholder record
+   from the corresponding redo_list. On actual request arrival, we can just
+   update the placeholder with the payload and don't touch the redo_lists[].
+
+   This method has several drawbacks: 1) it generates additional
+   transactions (when they are not strictly necessary); 2) it requires
+   adding information about all participants into the pmsgs (additional
+   network load). So here is another, more lightweight approach: collect
+   such pmsgs in a volatile hash table (key - txn_id, value - list of
+   participants we've got pmsgs from), and consult this table each time we
+   create a new log record: if there were pmsgs for it already, don't add
+   this log record to the corresponding redo_lists[].
+
+   @verbatim
+   Case: find next min_nall_p
+   sdev fid, originator fid, timestamp
+
+
+   Case: min_nall_p array
+   Independent clocks o1 and o2.
+
+   o1,s1,s2,o2
+
+   s1: min_nall_p for o1 is @10.
+   s1 -> s2: Pmsg { o=o1, src=s1, dst=s2 ... min_nall_p=@10 }
+
+   what happens on s2?
+   min_nall_p[sdev][originator];
+   min_nall_p[local-sdev][*];  // owned by local dtm0 domain
+   min_nall_p[remote-sdev][*]; // is not owned by it
+
+   min_nall_p[size = nr of sdev][size = nr of originators];
+   min_nall_p[s1][o1] = @10;
+   ToMoveMaxAllP(o1):
+           min_nall_p_set = min_nall_p[*][o1];
+           if for all m in min_nall_p_set : m != 0 then
+                   min_min_nall_p = min(min_nall_p_set)
+                   if next(MaxAllP) < min_min_nall_p then
+                           MaxAllP = next(MaxAllP)
+                   fi
+
+   what happens on s1? how is min_nall_p updated?
+   s2 -> s1: Pmsg { o=o1, src=s2, dst=s1 ... min_nall_p=@5 }
+   local - s1, remote - s2, local - s3
+   tx1: o1, s1, s2, ...
+   tx2: o1, s3, s4, ...
+
+
+   on_pmsg(pmsg):
+           assert s1 is pmsg.dst
+           record = log.lookup(pmsg.id)
+           record.redo_list[pmsg.src].del()
+           if allp(record) and record.id ==
+              local_min_nall_p[pmsg.id.originator]:
+                   local_min_nall_p[pmsg.id.originator] =
+                           Next(?);
+   Next(?)
is:
+           iter = record
+           while all(iter) or is not participant(iter, s1):
+                   iter = next(iter)
+           return iter.timestamp
+
+   end
+   min_min_nall_p = ...
+   if ... then
+           min
+
+   min_nall_p[s1][o1];
+
+
+   s1: min_nall_p for o2 is @9.
+   s1 -> s2: Pmsg { o=o2, src=s1, dst=s2 ... min_nall_p=@9 }
+
+   what happens on s2?
+   min_nall_p[size = nr of sdev];
+   min_nall_p[s1] = @10 or @9???;
+   can we compare @10 and @9?;
+
+   ...
+   min_nall_p[s1][o2] = @9;
+   min_nall_p[*][o2];
+   @endverbatim
+
+ */
+
+
+#include "doc/dld/dld_template.h"
+
+/**
+   @defgroup DLDDFSInternal Motr Sample Module Internals
+   @brief Detailed functional specification of the internals of the
+   sample module.
+
+   This example is part of the DLD Template and Style Guide. It illustrates
+   how to keep internal documentation separate from external documentation
+   by using multiple @@defgroup commands in different files.
+
+   Please make sure that the module cross-references the DLD, as shown below.
+
+   @see @ref DLD and @ref DLD-lspec
+
+   @{
+ */
+
+/** Structure used internally */
+struct dld_sample_internal {
+	int dsi_f1; /**< field to do blah */
+};
+
+/** Invariant for dld_sample_internal must be called holding the mutex */
+static bool dld_sample_internal_invariant(const struct dld_sample_internal *dsi)
+{
+	if (dsi->dsi_f1 == 0)
+		return false;
+	return true;
+}
+
+/** @} */ /* end internal */
+
+/**
+   External documentation can be continued if need be - usually it should
+   be fully documented in the header only.
+   @addtogroup DLDFS
+   @{
+ */
+
+/**
+ * This is an example of bad documentation, where an external symbol is
+ * not documented in the externally visible header in which it is declared.
+ * This also results in Doxygen not being able to automatically reference
+ * it in the Functional Specification.
+ */
+unsigned int dld_bad_example;
+
+/** @} */ /* end-of-DLDFS */
+
+/*
+ *  Local variables:
+ *  c-indentation-style: "K&R"
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ *  fill-column: 79
+ *  scroll-step: 1
+ *  End:
+ */
diff --git a/dtm0/dtm0-dld.h b/dtm0/dtm0-dld.h
new file mode 100644
index 00000000000..75eadc529ac
--- /dev/null
+++ b/dtm0/dtm0-dld.h
@@ -0,0 +1,309 @@
+/* -*- C -*- */
+/*
+ * Copyright (c) 2012-2020 Seagate Technology LLC and/or its Affiliates
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * For any questions about this software or licensing,
+ * please email opensource@seagate.com or cortx-questions@seagate.com.
+ *
+ */
+
+
+#pragma once
+
+#ifndef __MOTR_DLD_DTM0_H__
+#define __MOTR_DLD_DTM0_H__
+
+/**
+   @page DLD-fspec DLD Functional Specification Template
+   Mandatory. This page describes the external interfaces of the
+   component. The section has mandatory sub-divisions created using the
+   Doxygen @@section command. It is required that there be a Table of
+   Contents at the top of the page that illustrates the sectioning of the
+   page.
+
+   - @ref DLD-fspec-ds
+   - @ref DLD-fspec-sub
+   - @ref DLD-fspec-cli
+   - @ref DLD-fspec-usecases
+   - @ref DLDDFS "Detailed Functional Specification"
+
+   XXX
+   DTM0 has two interfaces: dtx0 and domain. DTM0 domain is used to
+   initialize global dtm0-related structures. DTX0 is used to execute
+   transactions.
+
+
+   The Functional Specification section of the DLD shall be placed in a
+   separate Doxygen page, identified as a @@subpage of the main specification
+   document through the table of contents in the main document page. The
+   purpose of this separation is to co-locate the Functional Specification in
+   the same source file as the Detailed Functional Specification.
+
+   A table of contents should be created for the major sections in this page,
+   as illustrated above. It should also contain references to other
+   @b external Detailed Functional Specification sections, which even
+   though may be present in the same source file, would not be visibly linked
+   in the Doxygen output.
+
+   @section DLD-fspec-ds Data Structures
+   Mandatory for programmatic interfaces. Components with programming
+   interfaces should provide an enumeration and @b brief description of the
+   major externally visible data structures defined by this component. No
+   details of the data structure are required here, just the salient
+   points.
+
+   For example:
+
            + The @c dld_sample_ds1 structure tracks the density of the + electro-magnetic field with the following: +@code +struct dld_sample_ds1 { + ... + int dsd_flux_density; + ... +}; +@endcode + The value of this field is inversely proportional to the square of the + number of lines of comments in the DLD. +
+   Note the indentation above, accomplished by means of an HTML table,
+   is purely for visual effect in the Doxygen output of the style guide.
+   A real DLD should not use such constructs.
+
+   Simple lists can also suffice:
+   - dld_sample_ds1
+   - dld_bad_example
+
+   The section could also describe what use it makes of data structures
+   described elsewhere.
+
+   Note that data structures are defined in the
+   @ref DLDDFS "Detailed Functional Specification"
+   so do not duplicate the definitions!
+   Do not describe internal data structures here either - they can be
+   described in the @ref DLD-lspec "Logical Specification" if necessary.
+
+   @section DLD-fspec-sub Subroutines
+   Mandatory for programmatic interfaces. Components with programming
+   interfaces should provide an enumeration and brief description of the
+   externally visible programming interfaces.
+
+   Externally visible interfaces should be enumerated and categorized by
+   function. Do not provide details. They will be fully documented in
+   the @ref DLDDFS "Detailed Functional Specification".
+   Do not describe internal interfaces - they can be described in the
+   @ref DLD-lspec "Logical Specification" if necessary.
+
+   @subsection DLD-fspec-sub-cons Constructors and Destructors
+
+   @subsection DLD-fspec-sub-acc Accessors and Invariants
+
+   @subsection DLD-fspec-sub-opi Operational Interfaces
+   - dld_sample_sub1()
+
+   @section DLD-fspec-cli Command Usage
+   Mandatory for command line programs. Components that provide programs
+   would provide a specification of the command line invocation arguments. In
+   addition, the format of any structured file consumed or produced by the
+   interface must be described in this section.
+
+   @section DLD-fspec-usecases Recipes
+   This section could briefly explain what sequence of interface calls or
+   what program invocation flags are required to solve specific usage
+   scenarios. It would be very nice if these examples can be linked
+   back to the HLD for the component.
+
+   Note the following references to the Detailed Functional Specification
+   sections at the end of these Functional Specifications, created using the
+   Doxygen @@see command:
+
+   @see @ref DLDDFS "Sample Detailed Functional Specification"
+ */
+
+/**
+   @defgroup DLDDFS Motr Sample Module
+   @brief Detailed functional specification template.
+
+   This page is part of the DLD style template. Detailed functional
+   specifications go into a module described by the Doxygen @@defgroup
+   command. Note that you cannot use a hyphen (-) in the tag of a
+   @@defgroup.
+
+   Module documentation may spread across multiple source files. Make sure
+   that the @@addtogroup Doxygen command is used in the other files to merge
+   their documentation into the main group. When doing so, it is important to
+   ensure that the material flows logically when read through Doxygen.
+
+   You are not constrained to have only one module in the design. If multiple
+   modules are present you may use multiple @@defgroup commands to create
+   individual documentation pages for each such module, though it is a good
+   idea to use separate header files for the additional modules. In
+   particular, it is a good idea to separate the internal detailed
+   documentation from the external documentation in this header file. Please
+   make sure that the DLD and the modules cross-reference each other, as
+   shown below.
+
+   @see The @ref DLD "Motr Sample DLD" its
+   @ref DLD-fspec "Functional Specification"
+   and its @ref DLD-lspec-thread
+
+   @{
+ */
+
+
+struct m0_dtm0_pmsg {
+};
+
+/* LOG */
+
+/**
+ * Initializes the log record iterator for a sdev participant. It iterates
+ * over all records that were in the log during the last local process
+ * restart or during the last remote process restart for the process that
+ * handles that sdev.
+ */
+M0_INTERNAL void m0_dtm0_log_iter_init(struct m0_dtm0_log *dol);
+
+/**
+ * Gives the next log record for the sdev participant.
+ */
+M0_INTERNAL void m0_dtm0_log_iter_next(struct m0_dtm0_log *dol);
+
+/**
+ * Finalises the iterator. It MUST be done for every call of
+ * m0_dtm0_log_iter_init().
+ */
+M0_INTERNAL void m0_dtm0_log_iter_fini(struct m0_dtm0_log *dol);
+
+/**
+ * Notifies the log that the participant has restarted.
+ * All iterators for the participant MUST be finalized at the time of the
+ * call. Any record that doesn't have P from the participant at the time of
+ * the call will be returned during the next iteration for the participant.
+ */
+M0_INTERNAL void m0_dtm0_log_participant_restarted(struct m0_dtm0_log *dol);
+M0_INTERNAL void m0_dtm0_log_participant_restarted_credit(struct m0_dtm0_log *dol);
+
+/* pmach interface */
+
+/**
+ * Returns the next P messages for transactions that became persistent
+ * locally.
+ * @param[in,out] dtxs    Array allocated by the caller.
+ * @param[in,out] dtxs_nr The size of dtxs, and the number of transactions
+ *                        returned by this function.
+ * If the returned dtxs_nr is 0 then the log is being stopped, so that no
+ * further calls to the function should be made.
+ */
+M0_INTERNAL void m0_dtm0_log_p_get(struct m0_dtm0_log *dol,
+                                   struct m0_be_op    *op,
+                                   struct m0_fid      *sdev_fid,
+                                   struct m0_dtx0_id  *dtxs,
+                                   uint64_t           *dtxs_nr);
+
+/**
+ * Records that a P message was received for the sdev participant.
+ */
+M0_INTERNAL void m0_dtm0_log_p_put(struct m0_dtm0_log  *dol,
+                                   struct m0_be_tx     *tx,
+                                   struct m0_dtm0_pmsg *pmsgs,
+                                   uint64_t             pmsgs_nr);
+
+M0_INTERNAL void m0_dtm0_log_p_put_credit(struct m0_dtm0_log     *dol,
+                                          uint64_t                pmsgs_nr,
+                                          struct m0_be_tx_credit *accum);
+
+/* pruner interface */
+
+/**
+ * Returns the dtx0 id for the dtx for which all participants (except the
+ * originator) have reported P for the dtx0.
+ */
+M0_INTERNAL void m0_dtm0_log_p_get_none_left(struct m0_dtm0_log *dol);
+
+/**
+ * Removes the REDO message about dtxs from the log.
+ */
+M0_INTERNAL void m0_dtm0_log_prune(struct m0_dtm0_log *dol,
+                                   struct m0_dtx0_id  *dtxs,
+                                   uint64_t            nr);
+
+M0_INTERNAL void m0_dtm0_log_prune_credit(struct m0_dtm0_log *dol,
+                                          uint64_t            nr);
+
+/* dtx0 interface, client & server */
+
+/**
+ * Checks if the transaction has to be applied or not, and records to the log
+ * an intent to apply that redo (in case it has to be applied).
+ * @param op_executed   will let the caller know when the record becomes
+ *                      executed. Useful for CAS foms to send the reply back.
+ *                      If it's NULL then it will not be used.
+ * @param op_persistent will let the caller know when the record becomes
+ *                      persistent (i.e. BE tx becomes M0_BTS_LOGGED). Useful
+ *                      for the local recovery machine to tell the remote
+ *                      recovery machine that the redo message was processed.
+ *                      If it's NULL then it will not be used.
+ * @param redo          The redo message to add the intent for.
+ * @param sdev_fid      Storage device the redo will be applied for.
+ * @return true if the redo has never been added to the log and this function
+ *         hasn't been called for this dtx.
+ * @return false if the redo has been applied earlier or there was an intent
+ *         to add it to the log. In this case op_executed and op_persistent
+ */
+M0_INTERNAL bool m0_dtm0_log_redo_add_intent(struct m0_dtm0_log  *dol,
+                                             struct m0_be_op     *op_executed,
+                                             struct m0_be_op     *op_persistent,
+                                             struct m0_dtm0_redo *redo,
+                                             struct m0_fid       *sdev_fid);
+
+/**
+ * Adds a REDO message and, optionally, a P message, to the log.
+ */
+M0_INTERNAL void m0_dtm0_log_redo_add(struct m0_dtm0_log  *dol,
+                                      struct m0_be_tx     *tx,
+                                      struct m0_dtm0_redo *redo,
+                                      struct m0_fid       *p_sdev_fid);
+
+/* dtx0 interface, client only */
+
+/**
+ * Returns the number of P messages for the dtx and waits until either the
+ * number increases or m0_dtm0_log_redo_cancel() is called.
+ */
+M0_INTERNAL void m0_dtm0_log_redo_p_wait(struct m0_dtm0_log *dol);
+
+/**
+ * Notification that the client doesn't need the dtx anymore. Before the
+ * function returns the op
+ */
+M0_INTERNAL void m0_dtm0_log_redo_cancel(struct m0_dtm0_log *dol);
+
+/**
+ * Notifies dtx0 that the operation the dtx0 is a part of is complete.
+ * This function MUST be called for every m0_dtm0_log_redo_add().
+ */
+M0_INTERNAL void m0_dtm0_log_redo_end(struct m0_dtm0_log *dol);
+
+
+/** @} */ /* DLDDFS end group */
+
+#endif /* __MOTR_DLD_DTM0_H__ */
+
+/*
+ *  Local variables:
+ *  c-indentation-style: "K&R"
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ *  fill-column: 79
+ *  scroll-step: 1
+ *  End:
+ */
diff --git a/dtm0/fop.c b/dtm0/fop.c
index 8eba703dcbb..5c5259eca04 100644
--- a/dtm0/fop.c
+++ b/dtm0/fop.c
@@ -324,7 +324,18 @@ M0_INTERNAL int m0_dtm0_on_committed(struct m0_fom *fom,
 	 * It is impossible to commit a transaction without DTM0 service up and
 	 * running.
 	 */
+	if (dtms == NULL) {
+		static uint32_t count = 0;
+		if (count == 0) {
+			M0_LOG(M0_FATAL, "DTM is enabled but is not "
+					 "configured in conf. Skip "
+					 "DTM now. Please Check!");
+			count++; /* Only print the message at the first time. */
+		}
+		return 0; /* FIXME but now let's skip it if no DTM service. */
+	}
 	M0_PRE(dtms != NULL);
+
 	log = dtms->dos_log;
 	M0_PRE(log != NULL);
 	/* It is impossible to commit something on a volatile log. */
diff --git a/dtm0/log.c b/dtm0/log.c
index de177e7b684..c36c7e202e2 100644
--- a/dtm0/log.c
+++ b/dtm0/log.c
@@ -440,6 +440,9 @@ M0_INTERNAL void m0_dtm0_log_prune(struct m0_dtm0_log *dol,
 			    &M0_BUF_INIT_PTR(dtx0_id), &rec_buf);
 	M0_ASSERT(ergo(rc != 0, rec == NULL));
 	M0_ASSERT(M0_IN(rc, (0, -ENOENT)));
+
+	if (rc == -ENOENT)
+		return;
 	m0_mutex_lock(&dol->dtl_lock);
 	M0_ASSERT(dtm0_log_invariant(dol));
 	dtm0_log_all_p_be_list_del(&dol->dtl_data->dtld_all_p, tx, rec);
diff --git a/dtm0/recovery.c b/dtm0/recovery.c
index 7fb39ee9e99..af75b9ffa2f 100644
--- a/dtm0/recovery.c
+++ b/dtm0/recovery.c
@@ -1396,6 +1396,7 @@ static void m0_be_queue__finish(struct m0_be_queue *bq, struct m0_buf *item)
 	}
 	M0_POST(bq->bq_the_end);
 	m0_be_queue_unlock(bq);
+	M0_LOG(M0_DEBUG, "The queue %p is ended.", bq);
 }
 
 #define M0_BE_QUEUE__FINISH(bq, item_type) ({                                  \
 	item_type item;                                                        \
@@ -2262,15 +2263,18 @@ m0_ut_remach_populate(struct m0_dtm0_recovery_machine *m,
 		}
 }
 
+/**
+ * This function is called as a postmortem after a REDO message has already
+ * been replayed. It checks if the REDO message contains the EOL flag. If yes,
+ * an EOL item is added to the EOL queue. This is to end the queue.
+ */ M0_INTERNAL void m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, struct dtm0_req_fop *redo, struct m0_be_op *op) { - bool is_eol = - !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL)); - bool is_eviction = - !!(redo->dtr_flags & M0_BITS(M0_DMF_EVICTION)); + bool is_eol = !!(redo->dtr_flags & M0_BITS(M0_DMF_EOL)); + bool is_eviction = !!(redo->dtr_flags & M0_BITS(M0_DMF_EVICTION)); const struct m0_fid *initiator = &redo->dtr_initiator; struct eolq_item item = {}; struct recovery_fom *rf; @@ -2287,11 +2291,14 @@ m0_dtm0_recovery_machine_redo_post(struct m0_dtm0_recovery_machine *m, .ei_type = EIT_EOL, .ei_source = *initiator, }; + /* Similar to eolq_post(). Maybe call it directly? */ m0_be_queue_lock(&rf->rf_eolq); - M0_ASSERT_INFO(!rf->rf_eolq.bq_the_end, - "REDOs are not allowed if local recovery" - " has already been finished."); - M0_BE_QUEUE_PUT(&rf->rf_eolq, op, &item); + if (!rf->rf_eolq.bq_the_end) + M0_BE_QUEUE_PUT(&rf->rf_eolq, op, &item); + else { + m0_be_op_active(op); + m0_be_op_done(op); + } m0_be_queue_unlock(&rf->rf_eolq); } else { M0_LOG(M0_WARN, diff --git a/ha/entrypoint.c b/ha/entrypoint.c index e5e2aea1724..e053e60e772 100644 --- a/ha/entrypoint.c +++ b/ha/entrypoint.c @@ -501,10 +501,12 @@ static struct m0_rpc_item_ops ha_entrypoint_client_item_ops = { static void ha_entrypoint_client_fop_release(struct m0_ref *ref) { struct m0_fop *fop; - + struct m0_ha_entrypoint_req_fop *req_fop_data; M0_ENTRY(); M0_PRE(ref != NULL); fop = container_of(ref, struct m0_fop, f_ref); + req_fop_data = (struct m0_ha_entrypoint_req_fop*)fop->f_data.fd_data; + m0_buf_free(&req_fop_data->erf_git_rev_id); fop->f_data.fd_data = NULL; m0_fop_fini(fop); M0_SET0(fop); diff --git a/layout/ut/plan.c b/layout/ut/plan.c index 7e6d39c215d..1a92729bafc 100644 --- a/layout/ut/plan.c +++ b/layout/ut/plan.c @@ -311,6 +311,8 @@ static int lap_ut_init(void) { int rc; + m0_fi_enable("m0_dtm0_in_ut", "ut"); + rc = lap_ut_server_start(); M0_ASSERT(rc == 0); @@ -325,6 +327,7 @@ static int lap_ut_fini(void) lap_ut_client_stop(); lap_ut_server_stop(); + m0_fi_disable("m0_dtm0_in_ut", "ut"); return 0; } diff --git a/m0t1fs/linux_kernel/st/m0t1fs_poolmach.sh b/m0t1fs/linux_kernel/st/m0t1fs_poolmach.sh index 1d9789d0af4..b4a521852d5 100755 --- a/m0t1fs/linux_kernel/st/m0t1fs_poolmach.sh +++ b/m0t1fs/linux_kernel/st/m0t1fs_poolmach.sh @@ -19,10 +19,10 @@ # -. $(dirname $0)/common.sh -. $(dirname $0)/m0t1fs_common_inc.sh -. $(dirname $0)/m0t1fs_client_inc.sh -. $(dirname $0)/m0t1fs_server_inc.sh +. $(dirname "$0")/common.sh +. $(dirname "$0")/m0t1fs_common_inc.sh +. $(dirname "$0")/m0t1fs_client_inc.sh +. $(dirname "$0")/m0t1fs_server_inc.sh pool_mach_test() { diff --git a/m0t1fs/linux_kernel/st/m0t1fs_rpc_cancel_test.sh b/m0t1fs/linux_kernel/st/m0t1fs_rpc_cancel_test.sh index 2e949cfe661..a29592a6bd2 100755 --- a/m0t1fs/linux_kernel/st/m0t1fs_rpc_cancel_test.sh +++ b/m0t1fs/linux_kernel/st/m0t1fs_rpc_cancel_test.sh @@ -19,11 +19,11 @@ # -. $(dirname $0)/common.sh -. $(dirname $0)/m0t1fs_common_inc.sh -. $(dirname $0)/m0t1fs_client_inc.sh -. $(dirname $0)/m0t1fs_server_inc.sh -. $(dirname $0)/m0t1fs_sns_common_inc.sh +. $(dirname "$0")/common.sh +. $(dirname "$0")/m0t1fs_common_inc.sh +. $(dirname "$0")/m0t1fs_client_inc.sh +. $(dirname "$0")/m0t1fs_server_inc.sh +. $(dirname "$0")/m0t1fs_sns_common_inc.sh . 
$M0_SRC_DIR/utils/functions # opcode diff --git a/m0t1fs/linux_kernel/st/m0t1fs_server.sh b/m0t1fs/linux_kernel/st/m0t1fs_server.sh index 7cd501bbb37..ab5d338a8f3 100755 --- a/m0t1fs/linux_kernel/st/m0t1fs_server.sh +++ b/m0t1fs/linux_kernel/st/m0t1fs_server.sh @@ -39,9 +39,9 @@ if [ "x$1" = "x-h" ]; then exit 0 fi -. $(dirname $0)/common.sh -. $(dirname $0)/m0t1fs_common_inc.sh -. $(dirname $0)/m0t1fs_server_inc.sh +. $(dirname "$0")/common.sh +. $(dirname "$0")/m0t1fs_common_inc.sh +. $(dirname "$0")/m0t1fs_server_inc.sh main() { diff --git a/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1f.sh b/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1f.sh index b4e94e978cc..0613c71993d 100755 --- a/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1f.sh +++ b/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1f.sh @@ -19,11 +19,11 @@ # -. $(dirname $0)/common.sh -. $(dirname $0)/m0t1fs_common_inc.sh -. $(dirname $0)/m0t1fs_client_inc.sh -. $(dirname $0)/m0t1fs_server_inc.sh -. $(dirname $0)/m0t1fs_sns_common_inc.sh +. $(dirname "$0")/common.sh +. $(dirname "$0")/m0t1fs_common_inc.sh +. $(dirname "$0")/m0t1fs_client_inc.sh +. $(dirname "$0")/m0t1fs_server_inc.sh +. $(dirname "$0")/m0t1fs_sns_common_inc.sh ################################################### # SNS repair is only supported in COPYTOOL mode, diff --git a/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1k_1f.sh b/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1k_1f.sh index d4b1885b0d4..3e814ab7e3b 100755 --- a/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1k_1f.sh +++ b/m0t1fs/linux_kernel/st/m0t1fs_sns_repair_1k_1f.sh @@ -19,11 +19,11 @@ # -. $(dirname $0)/common.sh -. $(dirname $0)/m0t1fs_common_inc.sh -. $(dirname $0)/m0t1fs_client_inc.sh -. $(dirname $0)/m0t1fs_server_inc.sh -. $(dirname $0)/m0t1fs_sns_common_inc.sh +. $(dirname "$0")/common.sh +. $(dirname "$0")/m0t1fs_common_inc.sh +. $(dirname "$0")/m0t1fs_client_inc.sh +. $(dirname "$0")/m0t1fs_server_inc.sh +. $(dirname "$0")/m0t1fs_sns_common_inc.sh ################################################### # SNS repair is only supported in COPYTOOL mode, diff --git a/motr/client.h b/motr/client.h index 80a62b03834..fd40cd5071e 100644 --- a/motr/client.h +++ b/motr/client.h @@ -568,6 +568,24 @@ enum m0_idx_opcode { M0_IC_NR /* 21 */ } M0_XCA_ENUM; +/** + * Option to set on an index operation. + * + * See @ref m0_idx_op_setoption. + */ +enum m0_op_idx_option { + /** + * Index operations are distributed across one or more CAS services + * to achieve the configured durability. This option specifies the + * minimum number of services that must successfully complete an + * operation for the operation as a whole to be considered + * successful. The value for this option must be greater than zero + * or a special value. This should normally be set to + * @ref M0_DIX_MIN_REPLICA_QUORUM to ensure consistency. + */ + M0_OIO_MIN_SUCCESS = 1, +}; + /** * Flags passed to m0_obj_op() to specify object IO operation behaviour. */ @@ -1628,6 +1646,18 @@ int m0_idx_op(struct m0_idx *idx, uint32_t flags, struct m0_op **op); +/** + * Set an option on an index operation. + * + * The index op must have been previously initialized with @ref m0_idx_op. + * It is undefined behavior to change an option after the index op has + * been launched. The meaning of each option is documented in + * @ref m0_op_idx_option. 
diff --git a/motr/client_init.c b/motr/client_init.c
index 154924689d4..9b65e8b4720 100644
--- a/motr/client_init.c
+++ b/motr/client_init.c
@@ -1710,11 +1710,27 @@ int m0_client_init(struct m0_client **m0c_p,
 
 	if (ENABLE_DTM0) {
 		struct m0_reqh_service *reqh_svc;
+		struct m0_confc        *confc = m0_reqh2confc(&m0c->m0c_reqh);
 
+		if (M0_IS0(confc)) {
+			M0_LOG(M0_FATAL, "DTM is enabled, but confc is not "
+					 "initialised. This is expected only "
+					 "in UTs that exercise failure paths; "
+					 "otherwise it needs investigation. "
+					 "Skipping DTM initialisation.");
+			rc = 0;
+			goto skip_dtm; /* FIXME */
+		}
 		rc = m0_conf_process2service_get(m0_reqh2confc(&m0c->m0c_reqh),
 						 &m0c->m0c_reqh.rh_fid,
 						 M0_CST_DTM0, &cli_svc_fid);
-		M0_ASSERT(rc == 0);
+		if (rc != 0) {
+			M0_LOG(M0_FATAL, "DTM is enabled, but no DTM service "
+					 "is defined in the conf; please "
+					 "check the conf file. Skipping DTM "
+					 "initialisation.");
+			rc = 0;
+			goto skip_dtm; /* FIXME Please add DTM service. */
+		}
 
 		if (m0_dtm0_in_ut()) {
 			/* When in UT, m0c_reqh.rh_fid is the same as the
@@ -1743,6 +1759,7 @@ int m0_client_init(struct m0_client **m0c_p,
 			ha_process_event(m0c, M0_CONF_HA_PROCESS_DTM_RECOVERED);
 	}
 
+skip_dtm:
 	if (conf->mc_is_addb_init) {
 		char buf[256]; /* uint64 max character size */
 
@@ -1801,7 +1818,10 @@ void m0_client_fini(struct m0_client *m0c, bool fini_m0)
 	M0_PRE(m0_sm_conf_is_initialized(&m0_op_conf));
 	M0_PRE(m0_sm_conf_is_initialized(&entity_conf));
 	M0_PRE(m0c != NULL);
+
+	/* FIXME please see m0_client_init()
 	M0_PRE(ergo(ENABLE_DTM0, m0c->m0c_dtms != NULL));
+	*/
 
 	if (m0c->m0c_dtms != NULL)
 		m0_dtm_client_service_stop(&m0c->m0c_dtms->dos_generic);
diff --git a/motr/client_internal.h b/motr/client_internal.h
index b50f1041cf9..ed4d812367e 100644
--- a/motr/client_internal.h
+++ b/motr/client_internal.h
@@ -230,6 +230,12 @@ struct m0_op_idx {
 	struct m0_sm_group *oi_sm_grp;
 	struct m0_ast_rc    oi_ar;
 
+	/**
+	 * Minimum number of successful CAS operations required to treat
+	 * the parent DIX operation as successful.
+	 */
+	int64_t             oi_min_success;
+
 	/* A bit-mask of m0_op_idx_flags. */
 	uint32_t            oi_flags;
diff --git a/motr/idx.c b/motr/idx.c
index ee1c87f7643..bc7e200916b 100644
--- a/motr/idx.c
+++ b/motr/idx.c
@@ -42,7 +42,9 @@ static void idx_op_cb_free(struct m0_op_common *oc);
 static void idx_op_cb_cancel(struct m0_op_common *oc);
 
 const struct m0_bob_type oi_bobtype;
+
 M0_BOB_DEFINE(M0_INTERNAL, &oi_bobtype, m0_op_idx);
+
 const struct m0_bob_type oi_bobtype = {
 	.bt_name         = "oi_bobtype",
 	.bt_magix_offset = offsetof(struct m0_op_idx, oi_magic),
@@ -206,6 +208,7 @@ static int idx_op_init(struct m0_idx *idx, int opcode,
 	oi->oi_vals  = vals;
 	oi->oi_rcs   = rcs;
 	oi->oi_flags = flags;
+	oi->oi_min_success = M0_DIX_MIN_REPLICA_QUORUM;
 
 	locality = m0__locality_pick(oi_instance(oi));
 	M0_ASSERT(locality != NULL);
@@ -217,6 +220,18 @@ static int idx_op_init(struct m0_idx *idx, int opcode,
 
 	if (ENABLE_DTM0 && !(flags & M0_OIF_NO_DTM) &&
 	    M0_IN(op->op_code, (M0_IC_PUT, M0_IC_DEL))) {
+		if (m0c->m0c_dtms == NULL) {
+			static uint32_t count = 0;
+
+			if (count == 0) {
+				M0_LOG(M0_FATAL, "DTM is enabled, but no DTM "
+						 "service is configured in "
+						 "the conf; skipping DTM. "
+						 "Please check!");
+				/* Print this message only once. */
+				count++;
+			}
+			oi->oi_dtx = NULL;
+			goto skip_dtm; /* FIXME Add DTM service to conf */
+		}
 		M0_ASSERT(m0c->m0c_dtms != NULL);
 		oi->oi_dtx = m0_dtx0_alloc(m0c->m0c_dtms, oi->oi_sm_grp);
 		if (oi->oi_dtx == NULL)
@@ -226,6 +241,7 @@ static int idx_op_init(struct m0_idx *idx, int opcode,
 		M0_ADDB2_ADD(M0_AVI_CLIENT_TO_DIX, cid, did);
 	} else
 		oi->oi_dtx = NULL;
+skip_dtm:
 
 	if (opcode == M0_EO_CREATE &&
 	    entity->en_type == M0_ET_IDX &&
 	    entity->en_flags & M0_ENF_META) {
@@ -593,6 +609,28 @@ int m0_idx_op(struct m0_idx *idx,
 }
 M0_EXPORTED(m0_idx_op);
 
+void m0_idx_op_setoption(struct m0_op *op,
+			 enum m0_op_idx_option option,
+			 int64_t value)
+{
+	struct m0_op_common *oc;
+	struct m0_op_idx    *oi;
+
+	M0_PRE(op != NULL);
+	oc = bob_of(op, struct m0_op_common, oc_op, &oc_bobtype);
+	oi = bob_of(oc, struct m0_op_idx, oi_oc, &oi_bobtype);
+
+	switch (option) {
+	case M0_OIO_MIN_SUCCESS:
+		M0_PRE(ergo(value < 1, value == M0_DIX_MIN_REPLICA_QUORUM));
+		oi->oi_min_success = value;
+		break;
+	default:
+		M0_IMPOSSIBLE("Invalid index op option");
+	}
+}
+M0_EXPORTED(m0_idx_op_setoption);
+
 /**
  * Sets an entity operation to create or delete an index.
 *
diff --git a/motr/idx.h b/motr/idx.h
index d2f689bef6f..5373788ec2d 100644
--- a/motr/idx.h
+++ b/motr/idx.h
@@ -108,6 +108,16 @@ enum m0_op_idx_flags {
 	M0_OIF_NO_DTM = 1 << 5
 };
 
+enum {
+	/**
+	 * Special value for @ref M0_OIO_MIN_SUCCESS. Sets the required
+	 * number of successful operations to (N+K)/2 + 1. This ensures
+	 * that transient failures cannot result in inconsistent/stale
+	 * data for the index operation.
+	 */
+	M0_DIX_MIN_REPLICA_QUORUM = -1
+};
+
 /**
  * Query operations for an index service. The operations in this data
 * structure can be divided into 2 groups:
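Spelling out the arithmetic behind the comment above (illustrative helper only; the function name and parameters are not part of this patch): with N data units and K parity units, the quorum is a strict majority of the N+K copies involved.

/* Illustrative only: the minimum number of successful CAS operations
 * that M0_DIX_MIN_REPLICA_QUORUM resolves to for an N+K layout. */
static int64_t dix_quorum_of(uint32_t n_data, uint32_t k_parity)
{
        return (int64_t)(n_data + k_parity) / 2 + 1;
}

/* Example: N=1, K=2 (three copies) gives (1 + 2) / 2 + 1 = 2, i.e. two of
 * the three CAS services must report success. */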
diff --git a/motr/idx_dix.c b/motr/idx_dix.c
index 5e5ef3a0357..984740e2b15 100644
--- a/motr/idx_dix.c
+++ b/motr/idx_dix.c
@@ -663,7 +663,7 @@ static int dix_req_create(struct m0_op_idx *oi,
 		m0_clink_init(&req->idr_dtx_clink, dixreq_clink_dtx_cb);
 		if (idx_is_distributed(oi)) {
 			m0_dix_req_init(&req->idr_dreq, op_dixc(oi),
-					oi->oi_sm_grp);
+					oi->oi_sm_grp, oi->oi_min_success);
 			to_dix_map(&oi->oi_oc.oc_op, &req->idr_dreq);
 			req->idr_dreq.dr_dtx = oi->oi_dtx;
 			m0_clink_init(&req->idr_clink, dixreq_clink_cb);
diff --git a/motr/motr-pub.api b/motr/motr-pub.api
index bf14e533e79..58e21f0c24e 100644
--- a/motr/motr-pub.api
+++ b/motr/motr-pub.api
@@ -91,6 +91,7 @@ m0_obj_fini
 m0_obj_op
 m0_idx_init
 m0_idx_fini
+m0_idx_op_setoption
 m0_idx_op
 m0_op_kick
 m0_rc
diff --git a/motr/setup.c b/motr/setup.c
index 2e91289692b..1fe5e5f8ec0 100644
--- a/motr/setup.c
+++ b/motr/setup.c
@@ -459,6 +459,7 @@ static void cs_reqh_ctx_fini(struct m0_reqh_context *rctx)
 		m0_free(rctx->rc_services[i]);
 	m0_free(rctx->rc_services);
 	m0_free(rctx->rc_service_fids);
+	m0_free((char *)rctx->rc_addb_stlocation);
 	rctx->rc_stob.s_sfile.sf_is_initialised = false;
 	rctx->rc_stob.s_ad_disks_init = false;
 }
@@ -1676,7 +1677,12 @@ static int cs_storage_setup(struct m0_motr *cctx)
 		}
 	}
 
-	M0_ASSERT(rctx->rc_mdstore.md_dom != NULL);
+	if (rctx->rc_mdstore.md_dom == NULL) {
+		rc = -ENOENT;
+		M0_ERR_INFO(rc, "Cob domain not found for root cob");
+		goto cleanup_addb2;
+	}
+
+	/* Initialise the mdstore and the root cob, which should have been
+	 * created by mkfs. */
 	rc = m0_mdstore_init(&rctx->rc_mdstore, rctx->rc_beseg, true);
 	if (rc != 0) {
@@ -2289,8 +2295,8 @@ static int _args_parse(struct m0_motr *cctx, int argc, char **argv)
 			LAMBDA(void, (const char *s)
 			{
 				char tmp_buf[512];
-				sprintf(tmp_buf, "%s-%d", s, (int)m0_pid());
-				rctx->rc_addb_stlocation = strdup(tmp_buf);
+				snprintf(tmp_buf, sizeof(tmp_buf), "%s-%d",
+					 s, (int)m0_pid());
+				rctx->rc_addb_stlocation = m0_strdup(tmp_buf);
 			})),
 		M0_STRINGARG('d', "Device configuration file",
 			LAMBDA(void, (const char *s)
diff --git a/motr/st/api.c b/motr/st/api.c
index 698722344c1..6679851d5a1 100644
--- a/motr/st/api.c
+++ b/motr/st/api.c
@@ -122,6 +122,10 @@ int st_idx_op(struct m0_idx *idx,
 	rc = m0_idx_op(idx, opcode, keys, vals, rcs, flag, op);
-	if (*op != NULL)
+	if (*op != NULL) {
 		st_mark_op(*op);
+		/* Set the option only when the op was actually created. */
+		m0_idx_op_setoption(*op, M0_OIO_MIN_SUCCESS,
+				    M0_DIX_MIN_REPLICA_QUORUM);
+	}
 
 	return rc;
 }
diff --git a/motr/st/mt/mt_fom.c b/motr/st/mt/mt_fom.c
index 99a2ef2962d..464d716522a 100644
--- a/motr/st/mt/mt_fom.c
+++ b/motr/st/mt/mt_fom.c
@@ -214,6 +214,11 @@ static int st_common_tick(struct m0_fom *fom, void *data, int *phase,
 	int     *rcs = fctx->sfc_rcs;
 	int64_t  ci;
 	int      rc;
+	int      batch;
+
+	batch = CMT_BATCH;
+	if (ENABLE_DTM0)
+		batch = 0; /* A single k/v pair in a DIX op. */
 
 	M0_LOG(M0_DEBUG, "i=%d fired=%d rqtype=%d",
 	       fctx->sfc_i, !!fctx->sfc_fired, rqtype);
@@ -229,7 +234,7 @@ static int st_common_tick(struct m0_fom *fom, void *data, int *phase,
 		M0_SET0(vals);
 		st_kv_alloc_and_fill(keys, vals,
 				     (int)ci * CMT_BATCH_OFF,
-				     (int)ci * CMT_BATCH_OFF + CMT_BATCH,
+				     (int)ci * CMT_BATCH_OFF + batch,
 				     M0_IN(rqtype, (REQ_GET, REQ_DEL)));
 		switch(rqtype) {
 		case REQ_CREATE:
@@ -279,7 +284,7 @@ static int st_common_tick(struct m0_fom *fom, void *data, int *phase,
 	case REQ_GET:
 		ci = fctx->sfc_ci;
 		st_vals_check(keys, vals,
			      (int)ci * CMT_BATCH_OFF,
-			      (int)ci * CMT_BATCH_OFF + CMT_BATCH);
+			      (int)ci * CMT_BATCH_OFF + batch);
 	default:
 		M0_ASSERT(m0_forall(i, CMT_IDXS_NR, rcs[i] == 0));
 	}
diff --git a/motr/ut/idx_dix.c b/motr/ut/idx_dix.c
index 9ad5dedfa6f..177f9b12943 100644
--- a/motr/ut/idx_dix.c
+++ b/motr/ut/idx_dix.c
@@ -466,6 +466,8 @@ static void ut_dix_record_ops(bool dist, uint32_t cr_get_flags,
 	struct m0_op_common *oc;
 	struct m0_op_idx    *oi;
 
+	if (ENABLE_DTM0) /* With DTM0, an op carries a single k/v pair. */
+		return;
 	idx_dix_ut_init();
 	general_ifid_fill(&ifid, dist);
 	m0_container_init(&realm, NULL, &M0_UBER_REALM, ut_m0c);
@@ -809,12 +811,23 @@ static void ut_dix_record_ops_non_dist_no_dtm(void)
 	ut_dix_record_ops(false, 0, M0_OIF_NO_DTM);
 }
 
+static int ut_suite_idx_dix_init(void)
+{
+	m0_fi_enable("m0_dtm0_in_ut", "ut");
+	return 0;
+}
+
+static int ut_suite_idx_dix_fini(void)
+{
+	m0_fi_disable("m0_dtm0_in_ut", "ut");
+	return 0;
+}
+
 struct m0_ut_suite ut_suite_idx_dix = {
 	.ts_name   = "idx-dix",
 	.ts_owners = "Egor",
-	.ts_init   = NULL,
-	.ts_fini   = NULL,
+	.ts_init   = ut_suite_idx_dix_init,
+	.ts_fini   = ut_suite_idx_dix_fini,
 	.ts_tests  = {
 		{ "init-fini",      ut_dix_init_fini,      "Egor" },
 		{ "namei-ops-dist", ut_dix_namei_ops_dist, "Egor" },
@@ -1293,7 +1306,7 @@ static void st_dtm0_r_common(uint32_t sdev_id)
 	dtm0_ut_send_redo(&duc.duc_ifid, sdev_id, &key, &val,
 			  M0_CAS_PUT_FOP_OPCODE);
 
-	/* XXX dirty hack, but now we don't have completion notification */
+	/* XXX dirty workaround: there is no completion notification yet. */
 	rem = 2ULL * M0_TIME_ONE_SECOND;
 	while (rem != 0)
 		m0_nanosleep(rem, &rem);
@@ -1304,7 +1317,7 @@ static void st_dtm0_r_common(uint32_t sdev_id)
 	dtm0_ut_send_redo(&duc.duc_ifid, sdev_id, &key, NULL,
 			  M0_CAS_DEL_FOP_OPCODE);
 
-	/* XXX dirty hack, but now we don't have completion notification */
+	/* XXX dirty workaround: there is no completion notification yet. */
 	rem = 2ULL * M0_TIME_ONE_SECOND;
 	while (rem != 0)
 		m0_nanosleep(rem, &rem);
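Both this suite and layout/ut/plan.c above bracket themselves with m0_fi_enable("m0_dtm0_in_ut", "ut") / m0_fi_disable(). A sketch of the predicate this presumably drives is below; the body shown is an assumption, not code from this patch, and the real m0_dtm0_in_ut() lives elsewhere in the tree.

#include "lib/finject.h"        /* M0_FI_ENABLED */

/* Sketch under the stated assumption. Fault points are keyed by
 * (function, tag), so for m0_fi_enable("m0_dtm0_in_ut", "ut") to match,
 * the M0_FI_ENABLED("ut") check must sit in a function literally named
 * m0_dtm0_in_ut. */
M0_INTERNAL bool m0_dtm0_in_ut(void)
{
        return M0_FI_ENABLED("ut");
}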
diff --git a/net/bulk_emulation/st/main.c b/net/bulk_emulation/st/main.c
index fb1c9dc02dc..87f127beb2f 100644
--- a/net/bulk_emulation/st/main.c
+++ b/net/bulk_emulation/st/main.c
@@ -256,7 +256,7 @@ void client(struct client_params *params)
 	if (params->xprt->px_3part_addr) {
 		cctx.pc_port = params->base_port;
 		cctx.pc_id   = params->client_id;
-		sprintf(ident, "Client %d:%d", cctx.pc_port, cctx.pc_id);
+		sprintf(ident, "Client %d:%u", cctx.pc_port, cctx.pc_id);
 		cctx.pc_rid  = PART3_SERVER_ID;
 	} else {
 		cctx.pc_port = params->base_port + params->client_id;
diff --git a/net/bulk_emulation/st/ping.c b/net/bulk_emulation/st/ping.c
index 790cf318f5d..10ce90aa2d3 100644
--- a/net/bulk_emulation/st/ping.c
+++ b/net/bulk_emulation/st/ping.c
@@ -136,6 +136,9 @@ int encode_msg(struct m0_net_buffer *nb, const char *str)
 	struct m0_bufvec_cursor incur;
 	struct m0_bufvec_cursor cur;
 
+	if (nb == NULL)
+		return -1;
+
 	nb->nb_length = len + 1;
 	m0_bufvec_cursor_init(&cur, &nb->nb_buffer);
 	bp = m0_bufvec_cursor_addr(&cur);
@@ -1189,6 +1192,7 @@ int ping_client_passive_send(struct ping_ctx *ctx,
 	M0_ASSERT(nb != NULL);
 	/* reuse encode_msg for convenience */
 	rc = encode_msg(nb, data);
+	M0_ASSERT(rc == 0);
 	nb->nb_qtype = M0_NET_QT_PASSIVE_BULK_SEND;
 	nb->nb_ep = server_ep;
 	if (ctx->pc_passive_bulk_timeout > 0) {
diff --git a/net/net.c b/net/net.c
index 914027e98b8..36d2b1a3a7e 100644
--- a/net/net.c
+++ b/net/net.c
@@ -28,9 +28,7 @@
 #include "lib/memory.h"
 #include "lib/misc.h"
 #include "lib/mutex.h"
-#ifndef __KERNEL__
-# include "lib/string.h" /* m0_streq */
-#endif
+#include "lib/string.h"
 
 #define M0_TRACE_SUBSYSTEM M0_TRACE_SUBSYS_NET
 #include "lib/trace.h"
diff --git a/net/sock/sock.c b/net/sock/sock.c
index 1953a0f87c9..a5cffd53c26 100644
--- a/net/sock/sock.c
+++ b/net/sock/sock.c
@@ -2386,6 +2386,7 @@ static int ep_find(struct ma *ma, const char *name, struct ep **out)
 /** Returns the end-point's transfer machine. */
 static struct ma *ep_ma(struct ep *ep)
 {
+	M0_PRE(ep != NULL);
 	return ep->e_ep.nep_tm->ntm_xprt_private;
 }
diff --git a/net/test/demo/test.sh b/net/test/demo/test.sh
deleted file mode 100755
index 15da77ca4a7..00000000000
--- a/net/test/demo/test.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright (c) 2020 Seagate Technology LLC and/or its Affiliates
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# For any questions about this software or licensing,
-# please email opensource@seagate.com or cortx-questions@seagate.com.
-#
-
-set -eux
-
-x=1
-for i in {1..100}; do
-	let x++
-done
-echo $x
diff --git a/net/test/network.c b/net/test/network.c
index a849d65dc4b..852092f2d3e 100644
--- a/net/test/network.c
+++ b/net/test/network.c
@@ -187,6 +187,7 @@ static int net_test_buf_init(struct m0_net_buffer *buf,
 static void net_test_buf_fini(struct m0_net_buffer *buf,
 			      struct m0_net_domain *dom)
 {
+	M0_PRE(buf != NULL);
 	M0_PRE(buf->nb_dom == dom);
 
 	m0_net_buffer_deregister(buf, dom);
diff --git a/net/test/node_bulk.c b/net/test/node_bulk.c
index 52b87d7f27a..1e602702129 100644
--- a/net/test/node_bulk.c
+++ b/net/test/node_bulk.c
@@ -850,10 +850,10 @@ static void node_bulk_cb(struct m0_net_test_network_ctx *net_ctx,
 	bool buf_send;
 	bool buf_bulk;
 
+	M0_PRE(net_ctx != NULL);
 	LOGD("node_bulk_cb: tm_addr = %s, buf_index = %u, q = %d"
 	     ", ev->nbe_status = %d", net_ctx->ntc_tm->ntm_ep->nep_addr,
 	     buf_index, q, ev->nbe_status);
-	M0_PRE(net_ctx != NULL);
 	role_client = ctx->nbc_nh.ntnh_role == M0_NET_TEST_ROLE_CLIENT;
 	M0_PRE(ergo(q == M0_NET_QT_MSG_RECV, !role_client));
 	M0_PRE(ergo(q == M0_NET_QT_MSG_SEND, role_client));
diff --git a/net/test/st/run-1x1.sh b/net/test/st/run-1x1.sh
index c4e24c9044d..3b41aa3c5ee 100644
--- a/net/test/st/run-1x1.sh
+++ b/net/test/st/run-1x1.sh
@@ -57,7 +57,7 @@ node_start_addr()
 		DIR_COUNTER=$(($DIR_COUNTER + 1))
 		"$CMD_M0NETTESTD" -a "$addr" -c "$addr_console" &
 		popd > /dev/null
-		eval PID_"$4"=$!
+		eval PID_"${pid_role}"=$!
 	fi
 }
@@ -145,7 +145,7 @@
 	"$PARSABLE" \
 	$BULK_PARAMETERS &
 PID_CONSOLE=$!
-wait $PID_CONSOLE
+wait "$PID_CONSOLE"
 
 # The same time for fini
 sleep "$NODE_INIT_DELAY"
diff --git a/net/test/st/st-bulk.sh b/net/test/st/st-bulk.sh
index 53f43fedbc4..5b72e4ed9e2 100755
--- a/net/test/st/st-bulk.sh
+++ b/net/test/st/st-bulk.sh
@@ -21,15 +21,15 @@
 
 CWD=$(cd "$( dirname "$0")" && pwd)
 
-source $CWD/st-config.sh
-TEST_TYPE="bulk"
-MSG_NR=1048576
-MSG_SIZE=1m
-CONCURRENCY_CLIENT=8
-CONCURRENCY_SERVER=16
-BD_BUF_NR_CLIENT=16
-BD_BUF_NR_SERVER=32
-BD_BUF_SIZE=16k
-BD_BUF_NR_MAX=8
+source "$CWD"/st-config.sh
+export TEST_TYPE="bulk"
+export MSG_NR=1048576
+export MSG_SIZE=1m
+export CONCURRENCY_CLIENT=8
+export CONCURRENCY_SERVER=16
+export BD_BUF_NR_CLIENT=16
+export BD_BUF_NR_SERVER=32
+export BD_BUF_SIZE=16k
+export BD_BUF_NR_MAX=8
 
-source $CWD/run-1x1.sh
+source "$CWD"/run-1x1.sh
diff --git a/net/test/st/st-ping.sh b/net/test/st/st-ping.sh
index a01060f461a..d18e05a00c9 100755
--- a/net/test/st/st-ping.sh
+++ b/net/test/st/st-ping.sh
@@ -22,10 +22,10 @@
 CWD=$(cd "$( dirname "$0")" && pwd)
 
 source "$CWD"/st-config.sh
-TEST_TYPE="ping"
-MSG_NR=1048576
-MSG_SIZE=4k
-CONCURRENCY_CLIENT=8
-CONCURRENCY_SERVER=16
+export TEST_TYPE="ping"
+export MSG_NR=1048576
+export MSG_SIZE=4k
+export CONCURRENCY_CLIENT=8
+export CONCURRENCY_SERVER=16
 
 source "$CWD"/run-1x1.sh
diff --git a/net/test/ut/commands.c b/net/test/ut/commands.c
index 3db9c6a3209..95ba0d10d8c 100644
--- a/net/test/ut/commands.c
+++ b/net/test/ut/commands.c
@@ -671,6 +671,7 @@ static void commands_node_thread2(struct net_test_cmd_node *node)
 
 	M0_ALLOC_PTR(cmd);
 	M0_UT_ASSERT(cmd != NULL);
+	M0_UT_ASSERT(nodes != NULL);
 
 	if (console_thread) {
 		buf[0] = '\0';
diff --git a/net/test/ut/slist.c b/net/test/ut/slist.c
index ede54cd0181..599b37c7b0e 100644
--- a/net/test/ut/slist.c
+++ b/net/test/ut/slist.c
@@ -117,7 +117,7 @@ void m0_net_test_slist_ut(void)
 		rc = m0_net_test_slist_init(&slist, buf, ',');
 		M0_UT_ASSERT(rc == 0);
 		rc_bool = m0_net_test_slist_unique(&slist);
-		M0_UT_ASSERT(!rc_bool ^ (i < 10));
+		M0_UT_ASSERT((!rc_bool) ^ (i < 10));
 		slist_check(&slist, i, buf);
 		/* serialize string list to buffer */
 		len = m0_net_test_slist_serialize(M0_NET_TEST_SERIALIZE, &slist,
diff --git a/net/ut/bulk_if.c b/net/ut/bulk_if.c
index b8cfeec3c7d..8cd50e89a0d 100644
--- a/net/ut/bulk_if.c
+++ b/net/ut/bulk_if.c
@@ -123,6 +123,7 @@ static int ut_dom_init(const struct m0_net_xprt *xprt,
 	ut_end_point_release_called = false;
 	ut_dom_fini_called = false;
 	ut_dom_init_called = true;
+	ut_xprt_pvt.num = 0;
 	dom->nd_xprt_private = &ut_xprt_pvt;
 	return 0;
 }
diff --git a/rpc/conn.c b/rpc/conn.c
index 638b65fac31..bfc85eaeb61 100644
--- a/rpc/conn.c
+++ b/rpc/conn.c
@@ -1433,7 +1433,8 @@ M0_INTERNAL int m0_rpc_conn_ha_timer_start(struct m0_rpc_conn *conn)
 		return M0_RC(0); /* there's no point to arm the timer */
 	if (m0_sm_timer_is_armed(&conn->c_ha_timer))
 		return M0_RC(0); /* Already started */
-	else
+	else if (M0_IN(conn->c_ha_timer.tr_timer.t_state,
+		       (M0_TIMER_STOPPED, M0_TIMER_INITED)))
 		m0_sm_timer_fini(&conn->c_ha_timer);
 	if (conn->c_rpc_machine->rm_stopping)
 		return M0_RC(0);
@@ -1448,11 +1449,14 @@
 
 M0_INTERNAL void m0_rpc_conn_ha_timer_stop(struct m0_rpc_conn *conn)
 {
+	M0_ENTRY("conn %p", conn);
 	M0_PRE(m0_rpc_machine_is_locked(conn->c_rpc_machine));
 	if (m0_sm_timer_is_armed(&conn->c_ha_timer)) {
 		M0_LOG(M0_DEBUG, "Cancelling HA timer; rpc conn=%p", conn);
 		m0_sm_timer_cancel(&conn->c_ha_timer);
-	}
+	} else if (M0_IN(conn->c_ha_timer.tr_timer.t_state,
+			 (M0_TIMER_STOPPED, M0_TIMER_INITED)))
+		m0_sm_timer_fini(&conn->c_ha_timer);
 }
diff --git a/scripts/jenkins/code-coverage-with-xperior-and-doxygen.groovy b/scripts/jenkins/code-coverage-with-xperior-and-doxygen.groovy
index 60b1dabc3e8..1a1cae85163 100755
--- a/scripts/jenkins/code-coverage-with-xperior-and-doxygen.groovy
+++ b/scripts/jenkins/code-coverage-with-xperior-and-doxygen.groovy
@@ -59,15 +59,15 @@ pipeline {
                 sh '''
                     set -ae
                     set
-                    WD=$(pwd)
+                    WORKING_DIR=$(pwd)
                     hostname
                     id
                     ls
                     export DO_MOTR_BUILD=yes
                     export TESTDIR=motr/.xperior/testds/
-                    export XPERIOR="${WD}/xperior"
-                    export ITD="${WD}/seagate-ci/xperior"
-                    export XPLIB="${WD}/xperior-perl-libs/extlib/lib/perl5"
+                    export XPERIOR="${WORKING_DIR}/xperior"
+                    export ITD="${WORKING_DIR}/seagate-ci/xperior"
+                    export XPLIB="${WORKING_DIR}/xperior-perl-libs/extlib/lib/perl5"
                     export PERL5LIB="${XPERIOR}/mongo/lib:${XPERIOR}/lib:${ITD}/lib:${XPLIB}/"
                     export PERL_HOME="/opt/perlbrew/perls/perl-5.22.0/"
                     export PATH="${PERL_HOME}/bin/:$PATH:/sbin/:/usr/sbin/"