diff --git a/TAG b/TAG
index aefa26665a7..47d92ef13d9 100644
--- a/TAG
+++ b/TAG
@@ -1 +1 @@
-2.6.1-rc2
+2.6.1-rc3
diff --git a/debian/changelog b/debian/changelog
index b9edca31485..6891cea4737 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+daos (2.6.1-3) unstable; urgency=medium
+  [ Phillip Henderson ]
+  * Third release candidate for 2.6.1
+
+ -- Phillip Henderson  Tue, 01 Oct 2024 14:23:00 -0500
+
 daos (2.6.1-2) unstable; urgency=medium
   [ Phillip Henderson ]
   * Second release candidate for 2.6.1
diff --git a/docs/release/release_notes.md b/docs/release/release_notes.md
index aab836cdeef..2f95f111120 100644
--- a/docs/release/release_notes.md
+++ b/docs/release/release_notes.md
@@ -2,6 +2,57 @@
 We are pleased to announce the release of DAOS version 2.6.
 
+## DAOS Version 2.6.1 (2024-10-05)
+
+The DAOS 2.6.1 release contains the following updates on top of DAOS 2.6.0:
+
+* Mercury update for the Slingshot 11.0 host stack and other UCX provider fixes.
+
+### Bug fixes and improvements
+
+The DAOS 2.6.1 release includes fixes for several defects and a few
+administrator interface changes that improve the usability of a DAOS system.
+
+* Fix a race between an MS replica stepping up as leader and engines joining
+  the system; this race could cause engine joins to fail.
+
+* Fix a race in concurrent container destroy that could cause an engine crash.
+
+* Pool destroy now returns an explicit error instead of success if there is an
+  in-progress destroy against the same pool.
+
+* Fix an EC aggregation defect that could cause inconsistency between data
+  shards and parity shards.
+
+* Enable pool list for clients.
+
+* Running "daos|dmg pool query-targets" with a rank argument now queries all
+  targets on that rank.
+
+* Add a "daos health check" command that allows basic system health checks
+  from the client.
+
+* DAOS Version 2.6.0 always excludes unreachable engines reported by SWIM and
+  schedules rebuild for the excluded engines. This is an overreaction when many
+  engines are impacted by a power failure or switch reboot, because data
+  recovery is impossible in those cases. DAOS 2.6.1 introduces a new
+  environment variable (DAOS_POOL_RF), set in the server yaml file for each
+  engine, that indicates the number of engine failures tolerated before DAOS
+  stops changing pool membership and completing in-progress rebuild; beyond
+  that point, all I/O and ongoing rebuild simply block. The DAOS system can
+  finish the in-progress rebuild and become available again once the impacted
+  engines are brought back. The recommendation is to set this environment
+  variable to 2 (a configuration sketch follows at the end of these notes).
+
+* In DAOS Version 2.6.0, accessing a faulty NVMe device returned a wrong error
+  code to the DAOS client, which could fail the application. DAOS 2.6.1 returns
+  the correct error code so the client can retry and eventually access the data
+  in degraded mode instead of failing the I/O.
+
+* A libpil4dfs fix to avoid a deadlock with the Level Zero library on Aurora,
+  and support for more libc functions that were not intercepted before.
+
+For details, please refer to the GitHub
+[release/2.6 commit history](https://github.com/daos-stack/daos/commits/release/2.6)
+and the associated [Jira tickets](https://jira.daos.io/) as stated in the
+commit messages.
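+
+As a reference, here is a minimal sketch of how DAOS_POOL_RF might be set,
+assuming the per-engine `env_vars` list format of the server yaml file; the
+surrounding engine settings are illustrative placeholders, not recommended
+values:
+
+```yaml
+engines:
+  - targets: 8            # illustrative engine settings
+    fabric_iface: eth0
+    env_vars:
+      # Assumed per the notes above: stop changing pool membership after
+      # 2 engine failures and block I/O and in-progress rebuild until the
+      # impacted engines return.
+      - DAOS_POOL_RF=2
+```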
+ + ## DAOS Version 2.6.0 (2024-07-26) ### General Support diff --git a/src/tests/ftest/datamover/copy_procs.py b/src/tests/ftest/datamover/copy_procs.py index ce980f373f4..a1734659587 100644 --- a/src/tests/ftest/datamover/copy_procs.py +++ b/src/tests/ftest/datamover/copy_procs.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -46,7 +46,7 @@ def test_copy_procs(self): :avocado: tags=DmvrCopyProcs,test_copy_procs """ # Create pool and containers - pool1 = self.create_pool() + pool1 = self.get_pool() cont1 = self.get_container(pool1) cont2 = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/dst_create.py b/src/tests/ftest/datamover/dst_create.py index a0a5f4bef27..379c152f763 100644 --- a/src/tests/ftest/datamover/dst_create.py +++ b/src/tests/ftest/datamover/dst_create.py @@ -58,8 +58,7 @@ def run_dm_dst_create(self, tool, cont_type, api, check_props): self.set_api(api) # Create 1 pool - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # Create a source cont cont1 = self.get_container(pool1, type=cont_type) @@ -98,8 +97,7 @@ def run_dm_dst_create(self, tool, cont_type, api, check_props): self.verify_cont(cont3, api, check_props, src_props) # Create another pool - pool2 = self.create_pool() - pool2.connect(2) + pool2 = self.get_pool() result = self.run_datamover( self.test_id + " cont1 to cont4 (different pool) (empty cont)", diff --git a/src/tests/ftest/datamover/large_dir.py b/src/tests/ftest/datamover/large_dir.py index f5e6c0e9aac..53187382007 100644 --- a/src/tests/ftest/datamover/large_dir.py +++ b/src/tests/ftest/datamover/large_dir.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -46,7 +46,7 @@ def run_dm_large_dir(self, tool): file_size = self.params.get("bytes", self.mdtest_cmd.namespace) # create pool and cont1 - pool = self.create_pool() + pool = self.get_pool() cont1 = self.get_container(pool) # run mdtest to create data in cont1 diff --git a/src/tests/ftest/datamover/large_file.py b/src/tests/ftest/datamover/large_file.py index 6fc9faf03e0..b962bdca376 100644 --- a/src/tests/ftest/datamover/large_file.py +++ b/src/tests/ftest/datamover/large_file.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -43,7 +43,7 @@ def run_dm_large_file(self, tool): self.fail("Failed to get ior processes for {}".format(self.tool)) # create pool and cont - pool = self.create_pool() + pool = self.get_pool() cont1 = self.get_container(pool) # create initial data in cont1 diff --git a/src/tests/ftest/datamover/negative.py b/src/tests/ftest/datamover/negative.py index 3b05e1c8dfd..f3891045d86 100644 --- a/src/tests/ftest/datamover/negative.py +++ b/src/tests/ftest/datamover/negative.py @@ -65,7 +65,7 @@ def test_dm_bad_params_dcp(self): start_dfuse(self, dfuse) # Create a test pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) @@ -215,7 +215,7 @@ def test_dm_bad_params_fs_copy(self): start_dfuse(self, dfuse) # Create a test pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/obj_large_posix.py b/src/tests/ftest/datamover/obj_large_posix.py index 87f252ea23e..f522e7a92df 100644 --- a/src/tests/ftest/datamover/obj_large_posix.py +++ b/src/tests/ftest/datamover/obj_large_posix.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -37,7 +37,7 @@ def run_dm_obj_large_posix(self, tool): file_size = self.params.get("bytes", "/run/mdtest/*") # Create pool1 and cont1 - pool1 = self.create_pool() + pool1 = self.get_pool() cont1 = self.get_container(pool1) # Create a large directory in cont1 diff --git a/src/tests/ftest/datamover/obj_small.py b/src/tests/ftest/datamover/obj_small.py index ed9ba5674b5..4e3a4d1fbb2 100644 --- a/src/tests/ftest/datamover/obj_small.py +++ b/src/tests/ftest/datamover/obj_small.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -58,8 +58,7 @@ def run_dm_obj_small(self, tool): self.set_tool(tool) # Create pool1 - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # Create cont1 cont1 = self.get_container(pool1) @@ -85,8 +84,7 @@ def run_dm_obj_small(self, tool): self.num_akeys_array, self.akey_sizes, self.akey_extents) # Create pool2 - pool2 = self.create_pool() - pool2.connect(2) + pool2 = self.get_pool() # Clone cont1 to a new cont3 in pool2 result = self.run_datamover( diff --git a/src/tests/ftest/datamover/posix_meta_entry.py b/src/tests/ftest/datamover/posix_meta_entry.py index bb608c27853..a37818ccbcc 100644 --- a/src/tests/ftest/datamover/posix_meta_entry.py +++ b/src/tests/ftest/datamover/posix_meta_entry.py @@ -67,7 +67,7 @@ def run_dm_posix_meta_entry(self, tool): start_dfuse(self, dfuse) # Create 1 pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create 1 source container with test data cont1 = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/posix_preserve_props.py b/src/tests/ftest/datamover/posix_preserve_props.py index 91df7c11135..bc1e52ace71 100644 --- a/src/tests/ftest/datamover/posix_preserve_props.py +++ b/src/tests/ftest/datamover/posix_preserve_props.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -56,8 +56,7 @@ def run_dm_preserve_props(self, tool, cont_type, api): self.set_api(api) # Create 1 pool - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # set the path to read and write container properties self.preserve_props_path = join(self.tmp, "cont_props.h5") diff --git a/src/tests/ftest/datamover/posix_subsets.py b/src/tests/ftest/datamover/posix_subsets.py index 45e33d9cec9..fd14e0cf7b0 100644 --- a/src/tests/ftest/datamover/posix_subsets.py +++ b/src/tests/ftest/datamover/posix_subsets.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -54,7 +54,7 @@ def run_dm_posix_subsets(self, tool): start_dfuse(self, dfuse) # Create 1 pool - pool1 = self.create_pool() + pool1 = self.get_pool() # create dfuse containers to test copying to dfuse subdirectories dfuse_cont1 = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/posix_symlinks.py b/src/tests/ftest/datamover/posix_symlinks.py index 68d60e4c973..f1dc87e46d7 100644 --- a/src/tests/ftest/datamover/posix_symlinks.py +++ b/src/tests/ftest/datamover/posix_symlinks.py @@ -60,7 +60,7 @@ def run_dm_posix_symlinks(self, tool): start_dfuse(self, dfuse) # Create 1 pool - pool1 = self.create_pool() + pool1 = self.get_pool() # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/posix_types.py b/src/tests/ftest/datamover/posix_types.py index 0ef85d018a8..79583bfb574 100644 --- a/src/tests/ftest/datamover/posix_types.py +++ b/src/tests/ftest/datamover/posix_types.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -68,8 +68,8 @@ def run_dm_posix_types(self, tool): start_dfuse(self, dfuse) # Create 2 pools - pool1 = self.create_pool(label='pool1') - pool2 = self.create_pool(label='pool2') + pool1 = self.get_pool(label='pool1') + pool2 = self.get_pool(label='pool2') # Create a special container to hold UNS entries uns_cont = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/serial_large_posix.py b/src/tests/ftest/datamover/serial_large_posix.py index 6917097d901..0feb7253a8d 100644 --- a/src/tests/ftest/datamover/serial_large_posix.py +++ b/src/tests/ftest/datamover/serial_large_posix.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -43,7 +43,7 @@ def run_dm_serial_large_posix(self, tool): file_size = self.params.get("bytes", "/run/mdtest/*") # Create pool1 and cont1 - pool1 = self.create_pool() + pool1 = self.get_pool() cont1 = self.get_container(pool1) # Create a large directory in cont1 @@ -51,7 +51,7 @@ def run_dm_serial_large_posix(self, tool): self.run_mdtest_with_params("DAOS", "/", pool1, cont1, flags=mdtest_flags[0]) # Create pool2 - pool2 = self.create_pool() + pool2 = self.get_pool() # Use dfuse as a shared intermediate for serialize + deserialize dfuse_cont = self.get_container(pool1) diff --git a/src/tests/ftest/datamover/serial_small.py b/src/tests/ftest/datamover/serial_small.py index 28ce84bee35..75e91285959 100644 --- a/src/tests/ftest/datamover/serial_small.py +++ b/src/tests/ftest/datamover/serial_small.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2020-2022 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -56,8 +56,7 @@ def run_dm_serial_small(self, tool): self.set_tool(tool) # Create pool1 - pool1 = self.create_pool() - pool1.connect(2) + pool1 = self.get_pool() # Create cont1 cont1 = self.get_container(pool1) @@ -69,8 +68,7 @@ def run_dm_serial_small(self, tool): self.num_akeys_array, self.akey_sizes, self.akey_extents) # Create pool2 - pool2 = self.create_pool() - pool2.connect(2) + pool2 = self.get_pool() # Serialize/Deserialize cont1 to a new cont2 in pool2 result = self.run_datamover( diff --git a/src/tests/ftest/deployment/basic_checkout.py b/src/tests/ftest/deployment/basic_checkout.py index 52a828e8329..216e89fd795 100644 --- a/src/tests/ftest/deployment/basic_checkout.py +++ b/src/tests/ftest/deployment/basic_checkout.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -120,7 +120,7 @@ def test_basic_checkout_dm(self): self.ior_ppn = self.ppn # create pool and container - pool = self.create_pool() + pool = self.get_pool() cont = self.get_container(pool, oclass=self.ior_cmd.dfs_oclass.value) # run datamover diff --git a/src/tests/ftest/deployment/basic_checkout.yaml b/src/tests/ftest/deployment/basic_checkout.yaml index 03d420ab82b..7ce9515bae8 100644 --- a/src/tests/ftest/deployment/basic_checkout.yaml +++ b/src/tests/ftest/deployment/basic_checkout.yaml @@ -70,7 +70,7 @@ mdtest_easy: &mdtest_easy_base write_bytes: 0 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_dfs_s1: <<: *mdtest_easy_base diff --git a/src/tests/ftest/deployment/disk_failure.py b/src/tests/ftest/deployment/disk_failure.py index 23f2132171c..10f91c8074e 100644 --- a/src/tests/ftest/deployment/disk_failure.py +++ b/src/tests/ftest/deployment/disk_failure.py @@ -119,7 +119,6 @@ def test_disk_failure_w_rf(self): Test disk failures during the IO operation. :avocado: tags=all,manual - :avocado: tags=hw,medium :avocado: tags=deployment,disk_failure :avocado: tags=DiskFailureTest,test_disk_failure_w_rf """ @@ -131,7 +130,6 @@ def test_disk_fault_to_normal(self): Test a disk inducing faults and resetting is back to normal state. 
:avocado: tags=all,manual - :avocado: tags=hw,medium :avocado: tags=deployment,disk_failure :avocado: tags=DiskFailureTest,test_disk_fault_to_normal """ diff --git a/src/tests/ftest/deployment/io_sys_admin.py b/src/tests/ftest/deployment/io_sys_admin.py index bca8373ba5c..265c1ad42f3 100644 --- a/src/tests/ftest/deployment/io_sys_admin.py +++ b/src/tests/ftest/deployment/io_sys_admin.py @@ -40,66 +40,88 @@ def test_io_sys_admin(self): new_cont_user = self.params.get("user", "/run/container_set_owner/*") new_cont_group = self.params.get("group", "/run/container_set_owner/*") + # Toggle independent steps + steps_to_run = { + "pool_create_ownership": True, + "storage_system_query": True, + "io": True, + "snapshot": True, + "datamover": True + } + for step in steps_to_run: + run = self.params.get(step, "/run/io_sys_admin/steps_to_run/*", None) + if run is not None: + steps_to_run[step] = run + dmg = self.get_dmg_command() daos = self.get_daos_command() - for idx in range(1, 4): - pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False) - check_pool_creation(self, [pool], 60) - containers = [] - for cont_idx in range(1, 4): - containers.append( - self.get_container(pool, namespace=f"/run/container_{cont_idx}/")) - containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@") - - daos.container_list(pool.identifier) - self.destroy_containers(containers) - pool.destroy() - - # dmg storage scan - dmg.storage_scan() - dmg.system_query() - dmg.system_leader_query() - - # write large data sets - self.run_file_count() - # create snapshot - self.container[-1].create_snap() - # overwrite the last ior file - self.ior_cmd.signature.update('456') - self.processes = self.ior_np - self.ppn = self.ior_ppn - self.run_ior_with_pool(create_pool=False, create_cont=False) - - nvme_free_space_before_snap_destroy = self.get_free_space()[1] - # delete snapshot - self.container[-1].destroy_snap(epc=self.container[-1].epoch) - # Now check if the space is returned back. - counter = 1 - returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy - - data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value)) - while returned_space < int(data_written): - # try to wait for 4 x 60 secs for aggregation to be completed or - # else exit the test with a failure. 
- if counter > 4: - self.log.info("Free space before snapshot destroy: %s", - nvme_free_space_before_snap_destroy) - self.log.info("Free space when test terminated: %s", - self.get_free_space()[1]) - self.fail("Aggregation did not complete as expected") - - time.sleep(60) + if steps_to_run["pool_create_ownership"]: + self.log_step("Verify pool creation time and container set-owner") + for idx in range(1, 4): + pool = self.get_pool(namespace=f"/run/pool_{idx}/", create=False) + check_pool_creation(self, [pool], 60) + containers = [] + for cont_idx in range(1, 4): + containers.append( + self.get_container(pool, namespace=f"/run/container_{cont_idx}/")) + containers[-1].set_owner(f"{new_cont_user}@", f"{new_cont_group}@") + + daos.container_list(pool.identifier) + self.destroy_containers(containers) + pool.destroy() + + if steps_to_run["storage_system_query"]: + self.log_step("Verify storage scan and system query") + dmg.storage_scan() + dmg.system_query() + dmg.system_leader_query() + + if steps_to_run["io"]: + self.log_step("Verifying large dataset IO") + self.run_file_count() + + if steps_to_run["snapshot"]: + self.log_step("Verifying snapshot creation and aggregation") + self.container[-1].create_snap() + # overwrite the last ior file + self.ior_cmd.signature.update('456') + self.processes = self.ior_np + self.ppn = self.ior_ppn + self.run_ior_with_pool(create_pool=False, create_cont=False) + + nvme_free_space_before_snap_destroy = self.get_free_space()[1] + # delete snapshot + self.container[-1].destroy_snap(epc=self.container[-1].epoch) + # Now check if the space is returned back. + counter = 1 returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy - counter += 1 - - self.log.info("#####Starting FS_COPY Test") - self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1]) - self.log.info("#####Starting DCP Test") - self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1]) - self.log.info("#####Starting DSERIAL Test") - self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1]) - self.log.info("#####Starting CONT_CLONE Test") - self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1]) - self.log.info("#####Completed all Datamover tests") - self.container.pop(0) + + data_written = (int(self.ppn) * human_to_bytes(self.ior_cmd.block_size.value)) + while returned_space < int(data_written): + # try to wait for 4 x 60 secs for aggregation to be completed or + # else exit the test with a failure. 
+ if counter > 4: + self.log.info( + "Free space before snapshot destroy: %s", + nvme_free_space_before_snap_destroy) + self.log.info( + "Free space when test terminated: %s", self.get_free_space()[1]) + self.fail("Aggregation did not complete as expected") + + time.sleep(60) + returned_space = self.get_free_space()[1] - nvme_free_space_before_snap_destroy + counter += 1 + + if steps_to_run["datamover"]: + self.log_step("Verifying datamover") + self.log.info("#####Starting FS_COPY Test") + self.run_dm_activities_with_ior("FS_COPY", self.pool, self.container[-1]) + self.log.info("#####Starting DCP Test") + self.run_dm_activities_with_ior("DCP", self.pool, self.container[-1]) + self.log.info("#####Starting DSERIAL Test") + self.run_dm_activities_with_ior("DSERIAL", self.pool, self.container[-1]) + self.log.info("#####Starting CONT_CLONE Test") + self.run_dm_activities_with_ior("CONT_CLONE", self.pool, self.container[-1]) + self.log.info("#####Completed all Datamover tests") + self.container.pop(0) diff --git a/src/tests/ftest/deployment/io_sys_admin.yaml b/src/tests/ftest/deployment/io_sys_admin.yaml index 6c3edab15b3..f2a238ad4b5 100644 --- a/src/tests/ftest/deployment/io_sys_admin.yaml +++ b/src/tests/ftest/deployment/io_sys_admin.yaml @@ -104,3 +104,11 @@ dcp: np: 16 hdf5_vol: plugin_path: /usr/lib64/mpich/lib + +io_sys_admin: + steps_to_run: + pool_create_ownership: True + storage_system_query: True + io: True + snapshot: True + datamover: True diff --git a/src/tests/ftest/performance/mdtest_easy.yaml b/src/tests/ftest/performance/mdtest_easy.yaml index 8fdd27031c2..a81db811686 100644 --- a/src/tests/ftest/performance/mdtest_easy.yaml +++ b/src/tests/ftest/performance/mdtest_easy.yaml @@ -46,7 +46,7 @@ mdtest: &mdtest_base write_bytes: 0 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_s1: &mdtest_s1 diff --git a/src/tests/ftest/performance/mdtest_hard.yaml b/src/tests/ftest/performance/mdtest_hard.yaml index ae3fcebaf5c..2bf5e0d73ca 100644 --- a/src/tests/ftest/performance/mdtest_hard.yaml +++ b/src/tests/ftest/performance/mdtest_hard.yaml @@ -47,7 +47,7 @@ mdtest: &mdtest_base write_bytes: 3901 num_of_files_dirs: 100000000 stonewall_timer: 30 - stonewall_statusfile: "/var/tmp/daos_testing/stoneWallingStatusFile" + stonewall_statusfile: stoneWallingStatusFile dfs_destroy: false mdtest_s1: &mdtest_s1 diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 04df3184984..25e7223e0fa 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -90,12 +90,12 @@ def copy_remote_to_local(remote_file_path, test_dir, remote): # Use clush --rcopy to copy the file from the remote server node to the local test # node. clush will append . to the file when copying. args = "--rcopy {} --dest {}".format(remote_file_path, test_dir) - clush_command = get_clush_command(hosts=remote, args=args) + clush_command = get_clush_command(hosts=remote, args=args, timeout=60) try: - run_command(command=clush_command) + run_command(command=clush_command, timeout=None) except DaosTestError as error: - print("ERROR: Copying {} from {}: {}".format(remote_file_path, remote, error)) - raise error + raise DaosTestError( + f"ERROR: Copying {remote_file_path} from {remote}: {error}") from error # Remove the appended . from the copied file. 
current_file_path = "".join([remote_file_path, ".", remote]) @@ -103,10 +103,8 @@ def copy_remote_to_local(remote_file_path, test_dir, remote): try: run_command(command=mv_command) except DaosTestError as error: - print( - "ERROR: Moving {} to {}: {}".format( - current_file_path, remote_file_path, error)) - raise error + raise DaosTestError( + f"ERROR: Moving {current_file_path} to {remote_file_path}: {error}") from error class DdbTest(RecoveryTestBase): diff --git a/src/tests/ftest/server/multiengine_persocket.py b/src/tests/ftest/server/multiengine_persocket.py index 8c92fdfbdad..0431a9b7b2a 100644 --- a/src/tests/ftest/server/multiengine_persocket.py +++ b/src/tests/ftest/server/multiengine_persocket.py @@ -63,15 +63,12 @@ def verify_list_attr(self, indata, attributes_list): self.log.info(" list_attr size: %s", size) if length != size: - self.fail( - "FAIL: Size does not match for Names in list attr, Expected " - "len={} and received len={}".format(length, size)) + self.fail(f"Container attribute list size mismatch: expected {length}, received {size}") + # verify the attribute names returned by list_attr for key in indata.keys(): if key.decode() not in attributes_list: - self.fail( - "FAIL: Name does not match after list attr, Expected " - "buf={} and received buf={}".format(key, attributes_list)) + self.fail(f"Unexpected container attribute received: {key}") def verify_get_attr(self, indata, outdata): """verify the Attributes value after get_attr. @@ -92,37 +89,29 @@ def verify_get_attr(self, indata, outdata): self.log.info(" set_attr data: %s", decoded) for attr, value in indata.items(): - if value != decoded.get(attr.decode(), None): + received = decoded.get(attr.decode(), None) + if value != received: self.fail( - "FAIL: Value does not match after get({}), Expected " - "val={} and received val={}".format(attr, value, - decoded.get(attr.decode(), None))) - - def daos_server_scm_reset(self, step): - """Perform daos_server scm reset. + f"Unexpected value for container attribute {attr}: expected {value}, " + f"received {received}") - Args: - step (str): test step. - """ + def daos_server_scm_reset(self): + """Perform daos_server scm reset.""" cmd = DaosServerCommand() cmd.sudo = False cmd.debug.value = False cmd.set_sub_command("scm") cmd.sub_command_class.set_sub_command("reset") cmd.sub_command_class.sub_command_class.force.value = True - self.log.info( "===(%s.A)Starting daos_server scm reset: %s", step, str(cmd)) + self.log_step("Resetting server PMem") results = run_remote(self.log, self.hostlist_servers, str(cmd), timeout=180) if not results.passed: - self.fail( "#({0}.A){1} failed, " "please make sure the server equipped with PMem modules".format(step, cmd)) + self.fail("Error resetting server PMem - ensure servers are equipped with PMem modules") - def daos_server_scm_prepare_ns(self, step, engines_per_socket=1): + def daos_server_scm_prepare_ns(self, engines_per_socket=1): """Perform daos_server scm prepare --scm-ns-per-socket. Args: - step (str): test step. engines_per_socket (int): number of engines per socket.
""" cmd = DaosServerCommand() @@ -132,15 +121,10 @@ def daos_server_scm_prepare_ns(self, step, engines_per_socket=1): cmd.sub_command_class.set_sub_command("prepare") cmd.sub_command_class.sub_command_class.scm_ns_per_socket.value = engines_per_socket cmd.sub_command_class.sub_command_class.force.value = True - - self.log.info( - "===(%s.B)Starting daos_server scm prepare -S: %s", step, str(cmd)) + self.log_step(f"Preparing server PMem for {engines_per_socket} engines per socket") results = run_remote(self.log, self.hostlist_servers, str(cmd), timeout=180) if not results.passed: - self.fail( - "#({0}.B){1} failed, " - "please make sure the server equipped with {2} PMem " - "modules.".format(step, cmd, engines_per_socket)) + self.fail(f"Error preparing server PMem for {engines_per_socket} engines per socket") def host_reboot(self, hosts): """To reboot the hosts. @@ -154,7 +138,7 @@ def host_reboot(self, hosts): if not wait_for_result(self.log, check_ping, 600, 5, True, host=hosts[0], expected_ping=False, cmd_timeout=60, verbose=True): - self.fail("Shutwown not detected within 600 seconds.") + self.fail("Shutdown not detected within 600 seconds.") if not wait_for_result(self.log, check_ping, 600, 5, True, host=hosts[0], expected_ping=True, cmd_timeout=60, verbose=True): self.fail("Reboot not detected within 600 seconds.") @@ -184,20 +168,9 @@ def storage_format(self): if not run_local(self.log, "dmg storage format").passed: self.fail("dmg storage format failed") - def cleanup(self): - """Servers clean up after test complete.""" - self.pool.destroy(recursive=1, force=1) - cleanup_cmds = [ - "sudo systemctl stop daos_server.service", - "sudo umount /mnt/daos*", - "sudo wipefs -a /dev/pmem*", - "/usr/bin/ls -l /dev/pmem*", - 'lsblk|grep -E "NAME|pmem"'] - for cmd in cleanup_cmds: - run_remote(self.log, self.hostlist_servers, cmd, timeout=90) - - def test_multiengines_per_socket(self): + def test_multi_engines_per_socket(self): """Test ID: DAOS-12076. + Test description: Test multiple engines/sockets. 
(1) Scm reset and prepare --scm-ns-per-socket (2) Start server (3) Start agent (4) Dmg system query (5) Pool create (6) Container create and attributes test (7) IOR test (8) MDTEST - (9) Cleanup + To launch test: (1) Make sure server is equipped with PMem - (2) ./launch.py test_multiengines_per_socket -ts -tc + (2) ./launch.py test_multi_engines_per_socket -ts -tc + :avocado: tags=manual :avocado: tags=server - :avocado: tags=MultiEnginesPerSocketTest,test_multiengines_per_socket + :avocado: tags=MultiEnginesPerSocketTest,test_multi_engines_per_socket """ - # (1) Scm reset and prepare --scm-ns-per-socket - step = 1 - self.log.info("===(%s)===Scm reset and prepare --scm-ns-per-socket", step) - engines_per_socket = self.params.get( - "engines_per_socket", "/run/server_config/*", default=1) - num_pmem = self.params.get( - "number_pmem", "/run/server_config/*", default=1) - self.daos_server_scm_reset(step) + server_namespace = "/run/server_config/*" + num_attributes = self.params.get("num_attributes", '/run/container/*') + _engines_per_socket = self.params.get("engines_per_socket", server_namespace, 1) + _num_pmem = self.params.get("number_pmem", server_namespace, 1) + + # Configure PMem for multiple engines per socket + self.daos_server_scm_reset() self.host_reboot(self.hostlist_servers) - self.daos_server_scm_prepare_ns(1.1, engines_per_socket) + self.daos_server_scm_prepare_ns(_engines_per_socket) self.host_reboot(self.hostlist_servers) - self.daos_server_scm_prepare_ns(1.2, engines_per_socket) + self.daos_server_scm_prepare_ns(_engines_per_socket) if not wait_for_result(self.log, self.check_pmem, 160, 1, False, - hosts=self.hostlist_servers, count=num_pmem): - self.fail("#{} pmem devices not found on all hosts.".format(num_pmem)) - self.storage_format() - - # (2) Start server - step += 1 - self.log.info("===(%s)===Start server", step) - start_server_cmds = [ - 'lsblk|grep -E "NAME|pmem"', - "sudo cp /etc/daos/daos_server.yml_4 /etc/daos/daos_server.yml", - "sudo systemctl start daos_server.service"] - for cmd in start_server_cmds: - results = run_remote(self.log, self.hostlist_servers, cmd, timeout=90) - # Check for server start status - if not results.passed: - self.fail("#Fail on {0}".format(cmd)) - - # (3) Start agent - step += 1 - self.log.info("===(%s)===Start agent", step) - start_agent_cmds = [ - "sudo systemctl start daos_agent.service", - "dmg storage scan", - "dmg network scan", - "dmg storage format", - "dmg storage query usage", - "dmg storage query list-devices", - "dmg system query"] - for cmd in start_agent_cmds: - results = run_remote(self.log, self.hostlist_clients, cmd, timeout=90) - # Check for agent start status - if not results.passed and "sudo systemctl" in cmd: - self.fail("#Fail on {0}".format(cmd)) - # (4) Dmg system query - step += 1 - self.log.info("===(%s)===Dmg system query", step) - # Delay is needed for multi ranks to show - query_cmds = [ - "dmg system query", - "dmg system query -v"] - for cmd in query_cmds: - results = run_remote(self.log, self.hostlist_clients, cmd, timeout=90) - - # (5) Pool create - step += 1 - self.log.info("===(%s)===Pool create", step) + hosts=self.hostlist_servers, count=_num_pmem): + self.fail(f"Error: {_num_pmem} PMem devices not found on all hosts.") + + # Start servers + self.log_step("Starting servers") + run_remote(self.log, self.hostlist_servers, 'lsblk|grep -E "NAME|pmem"') + self.start_servers() + + # Start agents + self.log_step("Starting agents") + self.start_agents() + + # Run some dmg commands +
self.log_step("Query the storage usage") + dmg = self.get_dmg_command() + # dmg.storage_query_usage() + dmg.storage_query_list_devices() + + # Create a pool + self.log_step("Create a pool") self.add_pool(connect=False) # (6) Container create and attributes test - step += 1 - self.log.info("===(%s)===Container create and attributes test", step) + self.log_step("Create a container and verify the attributes") self.add_container(self.pool) self.container.open() - num_attributes = self.params.get("num_attributes", '/run/attrtests/*') attr_dict = self.create_data_set(num_attributes) try: self.container.container.set_attr(data=attr_dict) data = self.container.list_attrs(verbose=False) self.verify_list_attr(attr_dict, data['response']) - data = self.container.list_attrs(verbose=True) self.verify_get_attr(attr_dict, data['response']) - except DaosApiError as excep: - self.log.info(excep) + except DaosApiError as error: + self.log.info(error) self.log.info(traceback.format_exc()) - self.fail("#Test was expected to pass but it failed.\n") + self.fail("Error setting and verifying container attributes") self.container.close() self.pool.disconnect() # (7) IOR test - step += 1 - self.log.info("===(%s)===IOR test", step) + self.log_step("Run ior") ior_timeout = self.params.get("ior_timeout", '/run/ior/*') self.run_ior_with_pool( timeout=ior_timeout, create_pool=True, create_cont=True, stop_dfuse=True) # (8) MDTEST - step += 1 - self.log.info("===(%s)===MDTEST", step) + self.log_step("Run mdtest") mdtest_params = self.params.get("mdtest_params", "/run/mdtest/*") self.run_mdtest_multiple_variants(mdtest_params) - - # (9) Cleanup - step += 1 - self.log.info("===(%s)===Cleanup", step) - cmd = "dmg system query -v" - results = run_remote(self.log, self.hostlist_clients, cmd, timeout=90) - self.cleanup() + self.log.info("Test passed") diff --git a/src/tests/ftest/server/multiengine_persocket.yaml b/src/tests/ftest/server/multiengine_persocket.yaml index d183b2cb0dc..628f05273cd 100644 --- a/src/tests/ftest/server/multiengine_persocket.yaml +++ b/src/tests/ftest/server/multiengine_persocket.yaml @@ -1,104 +1,103 @@ hosts: test_servers: 1 test_clients: 1 + timeout: 930 + setup: start_agents: False start_servers: False start_agents_once: False start_servers_once: False + server_config: name: daos_server + provider: ofi+tcp reboot_waittime: 210 + engines_per_host: 4 engines_per_socket: 2 number_pmem: 4 engines: - - + 0: pinned_numa_node: 0 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 31416 log_mask: ERR - log_file: /tmp/daos_engine.1.log + log_file: daos_engine.0.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: [/dev/pmem0] - scm_mount: /mnt/daos1 - - + scm_mount: /mnt/daos0 + 1: pinned_numa_node: 0 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 32416 log_mask: ERR - log_file: /tmp/daos_engine.2.log + log_file: daos_engine.1.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: [/dev/pmem0.1] - scm_mount: /mnt/daos2 - - + scm_mount: /mnt/daos1 + 2: pinned_numa_node: 1 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 33416 log_mask: ERR - log_file: /tmp/daos_engine.3.log + log_file: daos_engine.2.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list:
[/dev/pmem1] - scm_mount: /mnt/daos3 - - + scm_mount: /mnt/daos2 + 3: pinned_numa_node: 1 targets: 8 nr_xs_helpers: 0 # count of I/O offload threads per engine fabric_iface: eth0 fabric_iface_port: 34416 log_mask: ERR - log_file: /tmp/daos_engine.4.log + log_file: daos_engine.3.log env_vars: - FI_SOCKETS_MAX_CONN_RETRY=1 - FI_SOCKETS_CONN_TIMEOUT=2000 storage: - - + 0: class: dcpm scm_list: [/dev/pmem1.1] - scm_mount: /mnt/daos4 - transport_config: - allow_insecure: false -agent_config: - transport_config: - allow_insecure: false -dmg: - transport_config: - allow_insecure: false -provider: ofi+tcp + scm_mount: /mnt/daos3 + pool: control_method: dmg scm_size: 1G name: daos_server + container: control_method: daos type: POSIX properties: rf:0 -attrtests: num_attributes: 20 + dfuse: mount_dir: "/tmp/daos_dfuse1/" disable_caching: True + ior: ior_timeout: 120 client_processes: @@ -111,6 +110,7 @@ ior: iorflags: write_flg: "-w -W -k -G 1 -i 1" read_flg: "-C -k -e -r -R -g -G 1 -Q 1 -vv" + mdtest: client_processes: ppn: 8 diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index 0c3d300d5ff..00e95c6e128 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -145,8 +145,9 @@ def start_munge(self, user): non_control = self.nodes.difference(self.control) self.log.debug('Copying the munge key to %s', non_control) command = get_clush_command( - non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}") - result = run_remote(self.log, self.control, command) + non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}", + timeout=60) + result = run_remote(self.log, self.control, command, timeout=None) if not result.passed: raise SlurmSetupException(f'Error creating munge key on {result.failed_hosts}') diff --git a/src/tests/ftest/telemetry/wal_metrics.py b/src/tests/ftest/telemetry/wal_metrics.py index 105015aaf29..ee3f85ff2d7 100644 --- a/src/tests/ftest/telemetry/wal_metrics.py +++ b/src/tests/ftest/telemetry/wal_metrics.py @@ -200,8 +200,8 @@ def test_wal_checkpoint_metrics(self): # Check point dirty chunks should be 1-300 ranges[metric][label] = [1, 300] elif '_dirty_pages' in metric: - # Check point dirty pages should be 1-3 - ranges[metric][label] = [1, 3] + # Check point dirty pages should be 1-30 + ranges[metric][label] = [1, 30] elif '_duration' in metric: # Check point duration should be 1-2,000,000 ranges[metric][label] = [1, 2000000] diff --git a/src/tests/ftest/util/data_mover_test_base.py b/src/tests/ftest/util/data_mover_test_base.py index 669a720228c..64a330c09c8 100644 --- a/src/tests/ftest/util/data_mover_test_base.py +++ b/src/tests/ftest/util/data_mover_test_base.py @@ -113,7 +113,6 @@ def __init__(self, *args, **kwargs): self.ddeserialize_cmd = None self.fs_copy_cmd = None self.cont_clone_cmd = None - self.pool = [] self.dfuse_hosts = None self.num_run_datamover = 0 # Number of times run_datamover was called @@ -306,20 +305,6 @@ def _validate_param_type(self, param_type): self.fail("Invalid param_type: {}".format(_type)) return None - def create_pool(self, **params): - """Create a TestPool object and adds to self.pool. - - Returns: - TestPool: the created pool - - """ - pool = self.get_pool(connect=False, **params) - - # Save the pool - self.pool.append(pool) - - return pool - def parse_create_cont_label(self, output): """Parse a uuid or label from create container output. 
diff --git a/src/tests/ftest/util/ior_utils.py b/src/tests/ftest/util/ior_utils.py index 7851e4587d7..a7fd330d6ef 100644 --- a/src/tests/ftest/util/ior_utils.py +++ b/src/tests/ftest/util/ior_utils.py @@ -461,21 +461,6 @@ def get_ior_metrics(cmdresult): return (write_metrics, read_metrics) - @staticmethod - def log_metrics(logger, message, metrics): - """Log the ior metrics. - - Args: - logger (log): logger object handle - message (str) : Message to print before logging metrics - metric (lst) : IOR write and read metrics - """ - logger.info("\n") - logger.info(message) - for metric in metrics: - logger.info(metric) - logger.info("\n") - class IorMetrics(IntEnum): """Index Name and Number of each column in IOR result summary.""" diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 2f9d33b07c5..f4893558fb0 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -345,7 +345,8 @@ def log_result_data(log, data): log.debug("%s%s", " " * indent, line) -def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False): +def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False, + timeout=None, fanout=None): """Get the clush command with optional sudo arguments. Args: @@ -355,14 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su command_env (EnvironmentVariables, optional): environment variables to export with the command. Defaults to None. sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to None. + fanout (int, optional): fanout to use. Defaults to the max of the + clush default (64) and the number of available cores. Returns: str: the clush command """ - cmd_list = ["clush"] + if fanout is None: + fanout = max(64, len(os.sched_getaffinity(0))) + cmd_list = ["clush", "-f", str(fanout), "-w", str(hosts)] + if timeout is not None: + cmd_list.extend(["-u", str(timeout)]) if args: cmd_list.append(args) - cmd_list.extend(["-w", str(hosts)]) # If ever needed, this is how to disable host key checking: # cmd_list.extend(["-o", "-oStrictHostKeyChecking=no"]) cmd_list.append(command_as_user(command, "root" if command_sudo else "", command_env)) diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index c45bcbfff01..3bd44d646f3 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -15,7 +15,7 @@ Name: daos Version: 2.6.1 -Release: 2%{?relval}%{?dist} +Release: 3%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -594,6 +594,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Tue Oct 01 2024 Phillip Henderson 2.6.1-3 +- Third release candidate for 2.6.1 + * Fri Sep 20 2024 Phillip Henderson 2.6.1-2 - Second release candidate for 2.6.1
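As a closing reference for the `steps_to_run` block added to io_sys_admin.yaml above: io_sys_admin.py treats every step as enabled by default and only overrides a step when its flag is present under /run/io_sys_admin/steps_to_run/. A minimal sketch of a yaml override that runs only the IO and datamover steps (these override values are illustrative, not part of this patch):

```yaml
io_sys_admin:
  steps_to_run:
    pool_create_ownership: False  # skip pool creation time and set-owner checks
    storage_system_query: False   # skip dmg storage scan and system queries
    io: True                      # run the large dataset IO step
    snapshot: False               # skip snapshot creation and aggregation checks
    datamover: True               # run the FS_COPY/DCP/DSERIAL/CONT_CLONE tests
```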