From e9ecb6bd4ec4a952a30db11f734550eff76633bf Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Wed, 31 Jul 2024 17:26:31 +0000 Subject: [PATCH] DAOS-16298 test: improve clush and run_remote timeout Skip-unit-tests: true Skip-fault-injection-test: true Allow-unstable-test: true Features: soak_smoke dfuse Make sure remote commands are killed by using -t -t. Make clush timeout per host instead of for all hosts. Required-githooks: true Signed-off-by: Dalton Bohning --- src/tests/ftest/dfuse/daos_build.py | 4 +++- src/tests/ftest/util/dfuse_utils.py | 2 +- src/tests/ftest/util/general_utils.py | 1 + src/tests/ftest/util/run_utils.py | 25 +++++++++++++++++++++---- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/tests/ftest/dfuse/daos_build.py b/src/tests/ftest/dfuse/daos_build.py index 49f125b2ada..03b95468f26 100644 --- a/src/tests/ftest/dfuse/daos_build.py +++ b/src/tests/ftest/dfuse/daos_build.py @@ -255,8 +255,10 @@ def run_build_test(self, cache_mode, il_lib=None, run_on_vms=False): timeout = build_time * 60 self.log_step(f"Running '{cmd}' with a {timeout}s timeout") start = time.time() + # Ideally we shouldn't use detach here but the scons environment is incorrect otherwise result = run_remote( - self.log, self.hostlist_clients, command, verbose=True, timeout=timeout) + self.log, self.hostlist_clients, command, verbose=True, timeout=timeout, + detach=True) elapsed = time.time() - start (minutes, seconds) = divmod(elapsed, 60) self.log.info('Command %s completed in %d:%02d (%d%% of timeout)', diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py index a26f372e76d..1d545a2f5bc 100644 --- a/src/tests/ftest/util/dfuse_utils.py +++ b/src/tests/ftest/util/dfuse_utils.py @@ -258,7 +258,7 @@ def run(self, check=True, mount_callback=None): self._setup_mount_point() # run dfuse command - result = run_remote(self.log, self.hosts, self.with_exports, timeout=30) + result = run_remote(self.log, self.hosts, self.with_exports, timeout=30, detach=True) self._running_hosts.add(result.passed_hosts) if mount_callback: mount_callback(result) diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 84e55601ff2..b4130aed23b 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -909,6 +909,7 @@ def get_file_listing(hosts, files, user): """Get the file listing from multiple hosts. Args: + log (logger): logger for the messages produced by this method. hosts (NodeSet): hosts with which to use the clush command files (object): list of multiple files to list or a single file as a str user (str): user used to run the ls command diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 2f9d33b07c5..ba656a81d66 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -345,7 +345,8 @@ def log_result_data(log, data): log.debug("%s%s", " " * indent, line) -def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False): +def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False, + timeout=None, fanout=None): """Get the clush command with optional sudo arguments. Args: @@ -355,11 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su command_env (EnvironmentVariables, optional): environment variables to export with the command. Defaults to None. sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to None. + fanout (int, optional): fanout to use. Default uses the max of the + clush default (64) or available cores Returns: str: the clush command """ + if fanout is None: + fanout = max(64, len(os.sched_getaffinity(0))) cmd_list = ["clush"] + if timeout is not None: + cmd_list.extend(["-u", str(timeout)]) + if fanout is not None: + cmd_list.extend(["-f", str(fanout)]) if args: cmd_list.append(args) cmd_list.extend(["-w", str(hosts)]) @@ -428,7 +439,7 @@ def run_local(log, command, verbose=True, timeout=None, stderr=False, capture_ou def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, stderr=False, - fanout=None): + fanout=None, detach=False): """Run the command on the remote hosts. Args: @@ -442,6 +453,8 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, stderr (bool, optional): whether to enable stdout/stderr separation. Defaults to False. fanout (int, optional): fanout to use. Default uses the max of the clush default (64) or available cores + detach (bool, optional): whether to detach the process from the clush worker process. + Default is False. Returns: CommandResult: groups of command results from the same hosts with the same return status @@ -453,8 +466,12 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, if fanout is None: fanout = max(task.info('fanout'), len(os.sched_getaffinity(0))) task.set_info('fanout', fanout) - # Enable forwarding of the ssh authentication agent connection - task.set_info("ssh_options", "-oForwardAgent=yes") + # Enable forwarding of the ssh authentication agent connection. + ssh_options = "-oForwardAgent=yes" + # Force pseudo-terminal allocation so timed-out commands are killed remotely. + if not detach: + ssh_options += " -q -t -t" + task.set_info("ssh_options", ssh_options) if verbose: if timeout is None: log.debug("Running on %s without a timeout: %s", hosts, command)