Skip to content

Commit

Permalink
DAOS-16298 test: improve clush and run_remote timeout
Browse files Browse the repository at this point in the history
Skip-unit-tests: true
Skip-fault-injection-test: true

Features: soak_smoke

Make sure timed-out remote commands are killed by forcing ssh pseudo-terminal allocation (-t -t).
Make the clush timeout apply per host instead of to all hosts combined.

Required-githooks: true

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
  • Loading branch information
daltonbohning committed Jul 31, 2024
1 parent aae31a3 commit 78875ad
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 88 deletions.
4 changes: 2 additions & 2 deletions src/tests/ftest/daos_test/dfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def run_test(self, il_lib=None):
else:
# Bypass, simply create a remote directory and use that.
mount_dir = '/tmp/dfuse-test'
create_directory(self.hostlist_clients, mount_dir)
create_directory(self.log, self.hostlist_clients, mount_dir)

cmocka_utils = CmockaUtils(
self.hostlist_clients, "dfuse", self.outputdir, self.test_dir, self.log)
Expand Down Expand Up @@ -118,7 +118,7 @@ def run_test(self, il_lib=None):
else:
# make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem
dummy_dir = '/tmp/dummy'
create_directory(self.hostlist_clients, dummy_dir)
create_directory(self.log, self.hostlist_clients, dummy_dir)
daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir
if cache_mode != 'writeback':
command.append('--metadata')
Expand Down
2 changes: 1 addition & 1 deletion src/tests/ftest/recovery/ddb.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def test_recovery_ddb_load(self):
# Copy the created file to server node.
try:
distribute_files(
hosts=host, source=load_file_path, destination=load_file_path,
self.log, hosts=host, source=load_file_path, destination=load_file_path,
mkdir=False)
except DaosTestError as error:
raise CommandFailure(
Expand Down
4 changes: 1 addition & 3 deletions src/tests/ftest/util/apricot/apricot/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,9 +769,7 @@ def setUp(self):
hosts.add(self.hostlist_clients)
# Copy the fault injection files to the hosts.
self.fault_injection.copy_fault_files(hosts)
lines = get_file_listing(hosts, self.test_dir).stdout_text.splitlines()
for line in lines:
self.log.debug(" %s", line)
get_file_listing(self.log, hosts, self.test_dir)

if not self.start_servers_once or self.name.uid == 1:
# Kill commands left running on the hosts (from a previous test)
Expand Down
18 changes: 9 additions & 9 deletions src/tests/ftest/util/command_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1011,19 +1011,19 @@ def copy_certificates(self, source, hosts):
yaml.get_attribute_names(LogParameter))
for name in data:
create_directory(
hosts, name, verbose=False, raise_exception=False)
self.log, hosts, name, verbose=False, raise_exception=False)
for file_name in data[name]:
src_file = os.path.join(source, file_name)
dst_file = os.path.join(name, file_name)
self.log.debug(" %s -> %s", src_file, dst_file)
result = distribute_files(
hosts, src_file, dst_file, mkdir=False,
self.log, hosts, src_file, dst_file, mkdir=False,
verbose=False, raise_exception=False, sudo=True,
owner=self.certificate_owner)
if result.exit_status != 0:
if not result.passed:
self.log.info(
" WARNING: %s copy failed on %s:\n%s",
dst_file, hosts, result)
dst_file, result.failed_hosts, result)
names.add(name)
yaml = yaml.other_params

Expand All @@ -1032,8 +1032,7 @@ def copy_certificates(self, source, hosts):
self.log.debug(
"Copied certificates for %s (in %s):",
self._command, ", ".join(names))
for line in get_file_listing(hosts, names).stdout_text.splitlines():
self.log.debug(" %s", line)
get_file_listing(self.log, hosts, names)

def copy_configuration(self, hosts):
"""Copy the yaml configuration file to the hosts.
Expand All @@ -1055,7 +1054,7 @@ def copy_configuration(self, hosts):
self.temporary_file, self.yaml.filename, hosts)
try:
distribute_files(
hosts, self.temporary_file, self.yaml.filename,
self.log, hosts, self.temporary_file, self.yaml.filename,
verbose=False, sudo=True)
except DaosTestError as error:
raise CommandFailure(
Expand Down Expand Up @@ -1084,8 +1083,9 @@ def verify_socket_directory(self, user, hosts):
"%s: creating socket directory %s for user %s on %s",
self.command, directory, user, nodes)
try:
create_directory(nodes, directory, sudo=True)
change_file_owner(nodes, directory, user, get_primary_group(user), sudo=True)
create_directory(self.log, nodes, directory, sudo=True)
change_file_owner(
self.log, nodes, directory, user, get_primary_group(user), sudo=True)
except DaosTestError as error:
raise CommandFailure(
"{}: error setting up missing socket directory {} for "
Expand Down
3 changes: 2 additions & 1 deletion src/tests/ftest/util/fault_config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
SPDX-License-Identifier: BSD-2-Clause-Patent
"""

import logging
import os

import yaml
Expand Down Expand Up @@ -302,7 +303,7 @@ def copy_fault_files(self, hosts):
"""
if self._fault_list:
self._hosts = hosts
distribute_files(self._hosts, self.fault_file, self.fault_file)
distribute_files(logging.getLogger(), self._hosts, self.fault_file, self.fault_file)

def stop(self):
"""Remove the fault injection file created during testing.
Expand Down
109 changes: 40 additions & 69 deletions src/tests/ftest/util/general_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from avocado.utils import process
from ClusterShell.NodeSet import NodeSet
from ClusterShell.Task import task_self
from run_utils import get_clush_command, run_local, run_remote
from run_utils import command_as_user, get_clush_command, run_local, run_remote
from user_utils import get_chown_command, get_primary_group


Expand Down Expand Up @@ -891,11 +891,12 @@ def convert_string(item, separator=","):
return item


def create_directory(hosts, directory, timeout=15, verbose=True,
def create_directory(log, hosts, directory, timeout=15, verbose=True,
raise_exception=True, sudo=False):
"""Create the specified directory on the specified hosts.
Args:
log (logger): logger for the messages produced by this method.
hosts (NodeSet): hosts on which to create the directory
directory (str): the directory to create
timeout (int, optional): command timeout. Defaults to 15 seconds.
Expand All @@ -910,28 +911,22 @@ def create_directory(hosts, directory, timeout=15, verbose=True,
DaosTestError: if there is an error running the command
Returns:
CmdResult: an avocado.utils.process CmdResult object containing the
result of the command execution. A CmdResult object has the
following properties:
command - command string
exit_status - exit_status of the command
stdout - the stdout
stderr - the stderr
duration - command execution time
interrupted - whether the command completed within timeout
pid - command's pid
CommandResult: groups of command results from the same hosts with the same return status
"""
mkdir_command = "/usr/bin/mkdir -p {}".format(directory)
command = get_clush_command(hosts, args="-S -v", command=mkdir_command, command_sudo=sudo)
return run_command(command, timeout=timeout, verbose=verbose, raise_exception=raise_exception)
mkdir_command = command_as_user(f"/usr/bin/mkdir -p {directory}", "root" if sudo else None)
result = run_remote(log, hosts, mkdir_command, verbose=verbose, timeout=timeout)
if raise_exception and not result.passed:
raise DaosTestError(f"Error running: {mkdir_command}")
return result


def change_file_owner(hosts, filename, owner, group, timeout=15, verbose=True,
def change_file_owner(log, hosts, filename, owner, group, timeout=15, verbose=True,
raise_exception=True, sudo=False):
"""Change the ownership of the specified file on the specified hosts.
Args:
log (logger): logger for the messages produced by this method.
hosts (NodeSet): hosts on which to change the file ownership
filename (str): the file for which to change ownership
owner (str): new owner of the file
Expand All @@ -948,24 +943,18 @@ def change_file_owner(hosts, filename, owner, group, timeout=15, verbose=True,
DaosTestError: if there is an error running the command
Returns:
CmdResult: an avocado.utils.process CmdResult object containing the
result of the command execution. A CmdResult object has the
following properties:
command - command string
exit_status - exit_status of the command
stdout - the stdout
stderr - the stderr
duration - command execution time
interrupted - whether the command completed within timeout
pid - command's pid
CommandResult: groups of command results from the same hosts with the same return status
"""
chown_command = get_chown_command(owner, group, file=filename)
command = get_clush_command(hosts, args="-S -v", command=chown_command, command_sudo=sudo)
return run_command(command, timeout=timeout, verbose=verbose, raise_exception=raise_exception)
command = command_as_user(chown_command, "root" if sudo else None)
result = run_remote(log, hosts, command, verbose=verbose, timeout=timeout)
if raise_exception and not result.passed:
raise DaosTestError(f"Error running: {command}")
return result


def distribute_files(hosts, source, destination, mkdir=True, timeout=60,
def distribute_files(log, hosts, source, destination, mkdir=True, timeout=60,
verbose=True, raise_exception=True, sudo=False,
owner=None):
"""Copy the source to the destination on each of the specified hosts.
Expand All @@ -974,6 +963,7 @@ def distribute_files(hosts, source, destination, mkdir=True, timeout=60,
the specified hosts prior to copying the source.
Args:
log (logger): logger for the messages produced by this method.
hosts (NodeSet): hosts on which to copy the source
source (str): the file to copy to the hosts
destination (str): the host location in which to copy the source
Expand All @@ -994,24 +984,15 @@ def distribute_files(hosts, source, destination, mkdir=True, timeout=60,
DaosTestError: if there is an error running the command
Returns:
CmdResult: an avocado.utils.process CmdResult object containing the
result of the command execution. A CmdResult object has the
following properties:
command - command string
exit_status - exit_status of the command
stdout - the stdout
stderr - the stderr
duration - command execution time
interrupted - whether the command completed within timeout
pid - command's pid
CommandResult: groups of command results from the same hosts with the same return status
"""
result = None
if mkdir:
result = create_directory(
hosts, os.path.dirname(destination), verbose=verbose,
log, hosts, os.path.dirname(destination), verbose=verbose,
raise_exception=raise_exception)
if result is None or result.exit_status == 0:
if result is None or result.passed:
if sudo:
# In order to copy a protected file to a remote host in CI the
# source will first be copied as is to the remote host
Expand All @@ -1020,32 +1001,32 @@ def distribute_files(hosts, source, destination, mkdir=True, timeout=60,
if other_hosts:
# Existing files with strict file permissions can cause the
# subsequent non-sudo copy to fail, so remove the file first
rm_command = get_clush_command(
other_hosts, args="-S -v", command="rm -f {}".format(source),
command_sudo=True)
run_command(rm_command, verbose=verbose, raise_exception=False)
rm_command = command_as_user(f"rm -f {source}", "root")
run_remote(log, other_hosts, rm_command, verbose=verbose)
result = distribute_files(
other_hosts, source, source, mkdir=True,
log, other_hosts, source, source, mkdir=True,
timeout=timeout, verbose=verbose,
raise_exception=raise_exception, sudo=False, owner=None)
if result is None or result.exit_status == 0:
if result is None or result.passed:
# Then a local sudo copy will be executed on the remote node to
# copy the source to the destination
command = get_clush_command(
hosts, args="-S -v", command="cp {} {}".format(source, destination),
command_sudo=True)
result = run_command(command, timeout, verbose, raise_exception)
cp_cmd = command_as_user(f"cp {source} {destination}", "root")
result = run_remote(log, hosts, cp_cmd, verbose=verbose, timeout=timeout)
if raise_exception and not result.passed:
raise DaosTestError(f"Error running: {cp_cmd}")
else:
# Without the sudo requirement copy the source to the destination
# directly with clush
command = get_clush_command(
hosts, args="-S -v --copy {} --dest {}".format(source, destination))
result = run_command(command, timeout, verbose, raise_exception)
hosts, args=f"-S -v --copy {source} --dest {destination}", timeout=timeout)
result = run_local(log, command, verbose=verbose)
if raise_exception and not result.passed:
raise DaosTestError(f"Error running: {command}")

# If requested update the ownership of the destination file
if owner is not None and result.exit_status == 0:
if owner is not None and result.passed:
change_file_owner(
hosts, destination, owner, get_primary_group(owner), timeout=timeout,
log, hosts, destination, owner, get_primary_group(owner), timeout=timeout,
verbose=verbose, raise_exception=raise_exception, sudo=sudo)
return result

Expand All @@ -1064,30 +1045,20 @@ def get_default_config_file(name):
return os.path.join(os.sep, "etc", "daos", file_name)


def get_file_listing(hosts, files):
def get_file_listing(log, hosts, files):
"""Get the file listing from multiple hosts.
Args:
log (logger): logger for the messages produced by this method.
hosts (NodeSet): hosts with which to use the clush command
files (object): list of multiple files to list or a single file as a str
Returns:
CmdResult: an avocado.utils.process CmdResult object containing the
result of the command execution. A CmdResult object has the
following properties:
command - command string
exit_status - exit_status of the command
stdout - the stdout
stderr - the stderr
duration - command execution time
interrupted - whether the command completed within timeout
pid - command's pid
CommandResult: groups of command results from the same hosts with the same return status
"""
ls_command = "/usr/bin/ls -la {}".format(convert_string(files, " "))
command = get_clush_command(hosts, args="-S -v", command=ls_command, command_sudo=True)
result = run_command(command, verbose=False, raise_exception=False)
return result
return run_remote(log, hosts, ls_command, timeout=60)


def get_subprocess_stdout(subprocess):
Expand Down
18 changes: 15 additions & 3 deletions src/tests/ftest/util/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,8 @@ def log_result_data(log, data):
log.debug("%s%s", " " * indent, line)


def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False):
def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False,
timeout=None, fanout=None):
"""Get the clush command with optional sudo arguments.
Args:
Expand All @@ -291,11 +292,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su
command_env (EnvironmentVariables, optional): environment variables to export with the
command. Defaults to None.
command_sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False.
timeout (int, optional): number of seconds to wait for the command to complete.
Defaults to None.
fanout (int, optional): fanout to use. Defaults to the max of the
clush default (64) and the number of available cores.
Returns:
str: the clush command
"""
if fanout is None:
fanout = max(64, len(os.sched_getaffinity(0)))
cmd_list = ["clush"]
if timeout is not None:
cmd_list.extend(["-u", str(timeout)])
if fanout is not None:
cmd_list.extend(["-f", str(fanout)])
if args:
cmd_list.append(args)
cmd_list.extend(["-w", str(hosts)])
Expand Down Expand Up @@ -367,8 +378,9 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False,
if fanout is None:
fanout = max(task.info('fanout'), len(os.sched_getaffinity(0)))
task.set_info('fanout', fanout)
# Enable forwarding of the ssh authentication agent connection
task.set_info("ssh_options", "-oForwardAgent=yes")
# Enable forwarding of the ssh authentication agent connection.
# Force pseudo-terminal allocation so timed-out commands are killed remotely.
task.set_info("ssh_options", "-oForwardAgent=yes -q -t -t")
if verbose:
if timeout is None:
log.debug("Running on %s without a timeout: %s", hosts, command)
Expand Down

0 comments on commit 78875ad

Please sign in to comment.