From b913d3e9685fe8b31b046d197f66477460053e68 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 21 Oct 2024 14:19:00 -0400 Subject: [PATCH] DAOS-16265 test: Fix erasurecode/rebuild_fio.py out of space (#15020) (#15340) Prevent accumulating large server log files caused by temporarily enabling the DEBUG log mask while creating or destroying pools. Signed-off-by: Phil Henderson --- .../ftest/erasurecode/multiple_failure.yaml | 1 + src/tests/ftest/erasurecode/rebuild_fio.yaml | 1 + src/tests/ftest/util/apricot/apricot/test.py | 22 +++++++++++++++---- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/tests/ftest/erasurecode/multiple_failure.yaml b/src/tests/ftest/erasurecode/multiple_failure.yaml index 78f132474b5..95aab541329 100644 --- a/src/tests/ftest/erasurecode/multiple_failure.yaml +++ b/src/tests/ftest/erasurecode/multiple_failure.yaml @@ -25,6 +25,7 @@ server_config: storage: auto pool: size: 93% + set_logmasks: False container: type: POSIX control_method: daos diff --git a/src/tests/ftest/erasurecode/rebuild_fio.yaml b/src/tests/ftest/erasurecode/rebuild_fio.yaml index a895c356707..a3539d86579 100644 --- a/src/tests/ftest/erasurecode/rebuild_fio.yaml +++ b/src/tests/ftest/erasurecode/rebuild_fio.yaml @@ -39,6 +39,7 @@ pool: aggregation: threshold: 50000000 aggr_timeout: 180 + set_logmasks: False container: type: POSIX control_method: daos diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index 42e05937f37..563ff7adece 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -643,6 +643,7 @@ def __init__(self, *args, **kwargs): self.setup_start_agents = True self.slurm_exclude_servers = False self.slurm_exclude_nodes = NodeSet() + self.max_test_dir_usage_check = 90 self.host_info = HostInfo() self.hostlist_servers = NodeSet() self.hostlist_clients = NodeSet() @@ -693,6 +694,11 @@ def setUp(self): self.slurm_exclude_servers = self.params.get( "slurm_exclude_servers", "/run/setup/*", self.slurm_exclude_servers) + # Max test directory usage percentage - when exceeded will display sizes of files in the + # test directory + self.max_test_dir_usage_check = self.params.get( + "max_test_dir_usage_check", "/run/setup/*", self.max_test_dir_usage_check) + # The server config name should be obtained from each ServerManager # object, but some tests still use this TestWithServers attribute. self.server_group = self.params.get("name", "/run/server_config/*", "daos_server") @@ -765,12 +771,20 @@ def setUp(self): # List common test directory contents before running the test self.log.info("-" * 100) - self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir)) + self.log.debug( + "Common test directory (%s) contents (check > %s%%):", + os.path.dirname(self.test_dir), self.max_test_dir_usage_check) all_hosts = include_local_host(self.host_info.all_hosts) test_dir_parent = os.path.dirname(self.test_dir) - result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") - if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90: - run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*") + _result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}") + _details = NodeSet() + for _host, _stdout in _result.all_stdout.items(): + _test_dir_usage = re.findall(r"\s+([\d]+)%\s+", _stdout) + _test_dir_usage_int = int(max(_test_dir_usage + ["0"])) + if _test_dir_usage_int > self.max_test_dir_usage_check: + _details.add(_host) + if _details: + run_remote(self.log, _details, f"du -sh {test_dir_parent}/*") self.log.info("-" * 100) if not self.start_servers_once or self.name.uid == 1: