Skip to content

Commit

Permalink
DAOS-16298 test: improve clush and run_remote timeout
Browse files Browse the repository at this point in the history
Skip-unit-tests: true
Skip-fault-injection-test: true
Allow-unstable-test: true

Features: soak_smoke dfuse

Make sure timed-out remote commands are killed by forcing ssh pseudo-terminal allocation (-t -t).
Make the clush command timeout (-u) apply per host instead of for all hosts.

Required-githooks: true

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
  • Loading branch information
daltonbohning committed Sep 10, 2024
1 parent 226e283 commit e9ecb6b
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 6 deletions.
4 changes: 3 additions & 1 deletion src/tests/ftest/dfuse/daos_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,10 @@ def run_build_test(self, cache_mode, il_lib=None, run_on_vms=False):
timeout = build_time * 60
self.log_step(f"Running '{cmd}' with a {timeout}s timeout")
start = time.time()
# Ideally we shouldn't use detach here, but the scons environment is incorrect otherwise.
result = run_remote(
self.log, self.hostlist_clients, command, verbose=True, timeout=timeout)
self.log, self.hostlist_clients, command, verbose=True, timeout=timeout,
detach=True)
elapsed = time.time() - start
(minutes, seconds) = divmod(elapsed, 60)
self.log.info('Command %s completed in %d:%02d (%d%% of timeout)',
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/util/dfuse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def run(self, check=True, mount_callback=None):
self._setup_mount_point()

# run dfuse command
result = run_remote(self.log, self.hosts, self.with_exports, timeout=30)
result = run_remote(self.log, self.hosts, self.with_exports, timeout=30, detach=True)
self._running_hosts.add(result.passed_hosts)
if mount_callback:
mount_callback(result)
Expand Down
1 change: 1 addition & 0 deletions src/tests/ftest/util/general_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,7 @@ def get_file_listing(hosts, files, user):
"""Get the file listing from multiple hosts.
Args:
log (logger): logger for the messages produced by this method.
hosts (NodeSet): hosts with which to use the clush command
files (object): list of multiple files to list or a single file as a str
user (str): user used to run the ls command
Expand Down
25 changes: 21 additions & 4 deletions src/tests/ftest/util/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def log_result_data(log, data):
log.debug("%s%s", " " * indent, line)


def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False):
def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False,
timeout=None, fanout=None):
"""Get the clush command with optional sudo arguments.
Args:
Expand All @@ -355,11 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su
command_env (EnvironmentVariables, optional): environment variables to export with the
command. Defaults to None.
command_sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False.
timeout (int, optional): number of seconds to wait for the command to complete.
Defaults to None.
fanout (int, optional): fanout to use. Defaults to the larger of the clush default (64) and
the number of available cores.
Returns:
str: the clush command
"""
if fanout is None:
fanout = max(64, len(os.sched_getaffinity(0)))
cmd_list = ["clush"]
if timeout is not None:
cmd_list.extend(["-u", str(timeout)])
if fanout is not None:
cmd_list.extend(["-f", str(fanout)])
if args:
cmd_list.append(args)
cmd_list.extend(["-w", str(hosts)])
Expand Down Expand Up @@ -428,7 +439,7 @@ def run_local(log, command, verbose=True, timeout=None, stderr=False, capture_ou


def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, stderr=False,
fanout=None):
fanout=None, detach=False):
"""Run the command on the remote hosts.
Args:
Expand All @@ -442,6 +453,8 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False,
stderr (bool, optional): whether to enable stdout/stderr separation. Defaults to False.
fanout (int, optional): fanout to use. Defaults to the larger of the clush default (64) and
the number of available cores.
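detach (bool, optional): whether to detach the process from the clush worker process. If
False, ssh is forced to allocate a pseudo-terminal (-t -t) so timed-out commands are killed
remotely. Defaults to False.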
Returns:
CommandResult: groups of command results from the same hosts with the same return status
Expand All @@ -453,8 +466,12 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False,
if fanout is None:
fanout = max(task.info('fanout'), len(os.sched_getaffinity(0)))
task.set_info('fanout', fanout)
# Enable forwarding of the ssh authentication agent connection
task.set_info("ssh_options", "-oForwardAgent=yes")
# Enable forwarding of the ssh authentication agent connection.
ssh_options = "-oForwardAgent=yes"
# Force pseudo-terminal allocation so timed-out commands are killed remotely.
if not detach:
ssh_options += " -q -t -t"
task.set_info("ssh_options", ssh_options)
if verbose:
if timeout is None:
log.debug("Running on %s without a timeout: %s", hosts, command)
Expand Down

0 comments on commit e9ecb6b

Please sign in to comment.