Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new test case for container based warm restart #6054

Merged
merged 6 commits into from
Aug 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 91 additions & 7 deletions ansible/roles/test/files/ptftests/advanced-reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,17 @@ def __init__(self):
self.kvm_test = True
else:
self.kvm_test = False
if "service-warm-restart" in self.test_params['reboot_type']:
self.check_param('service_list', None, required=True)
self.check_param('service_data', None, required=True)
self.service_data = self.test_params['service_data']
for service_name in self.test_params['service_list']:
cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name)
stdout, _, _ = self.dut_connection.execCommand(cmd)
if service_name not in self.service_data:
self.service_data[service_name] = {}
self.service_data[service_name]['service_start_time'] = str(stdout[0]).strip()
self.log("Service start time for {} is {}".format(service_name, self.service_data[service_name]['service_start_time']))
return

def read_json(self, name):
Expand Down Expand Up @@ -437,7 +448,7 @@ def build_vlan_if_port_mapping(self):
portchannel_names = [pc['name'] for pc in portchannel_content.values()]

vlan_content = self.read_json('vlan_ports_file')

vlan_if_port = []
for vlan in self.vlan_ip_range:
for ifname in vlan_content[vlan]['members']:
Expand Down Expand Up @@ -926,6 +937,31 @@ def wait_until_control_plane_up(self):
self.no_control_stop = datetime.datetime.now()
self.log("Dut reboots: control plane up at %s" % str(self.no_control_stop))

def wait_until_service_restart(self):
self.log("Wait until sevice restart")
self.reboot_start = datetime.datetime.now()
service_set = set(self.test_params['service_list'])
wait_time = 120
while wait_time > 0:
for service_name in self.test_params['service_list']:
if service_name not in service_set:
continue
cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name)
stdout, _, _ = self.dut_connection.execCommand(cmd)
if self.service_data[service_name]['service_start_time'] != str(stdout[0]).strip():
service_set.remove(service_name)
if not service_set:
break
wait_time -= 10
time.sleep(10)

if service_set:
self.fails['dut'].add("Container {} hasn't come back up in {} seconds".format(','.join(service_set), wait_time))
raise TimeoutError

# TODO: add timestamp
self.log("Service has restarted")

def handle_fast_reboot_health_check(self):
self.log("Check that device is still forwarding data plane traffic")
self.fails['dut'].add("Data plane has a forwarding problem after CPU went down")
Expand Down Expand Up @@ -1017,6 +1053,10 @@ def wait_for_ssh_threads(signal):
# verify there are no interface flaps after warm boot
self.neigh_lag_status_check()

if 'service-warm-restart' == self.reboot_type:
# verify there are no interface flaps after warm boot
self.neigh_lag_status_check()

def handle_advanced_reboot_health_check_kvm(self):
self.log("Wait until data plane stops")
forward_stop_signal = multiprocessing.Event()
Expand Down Expand Up @@ -1193,8 +1233,11 @@ def runTest(self):
thr = threading.Thread(target=self.reboot_dut)
thr.setDaemon(True)
thr.start()
self.wait_until_control_plane_down()
self.no_control_start = self.cpu_state.get_state_time('down')
if self.reboot_type != 'service-warm-restart':
self.wait_until_control_plane_down()
self.no_control_start = self.cpu_state.get_state_time('down')
else:
self.wait_until_service_restart()

if 'warm-reboot' in self.reboot_type:
finalizer_timeout = 60 + self.test_params['reboot_limit_in_seconds']
Expand All @@ -1210,7 +1253,7 @@ def runTest(self):
else:
if self.reboot_type == 'fast-reboot':
self.handle_fast_reboot_health_check()
if 'warm-reboot' in self.reboot_type:
if 'warm-reboot' in self.reboot_type or 'service-warm-restart' == self.reboot_type:
self.handle_warm_reboot_health_check()
self.handle_post_reboot_health_check()

Expand Down Expand Up @@ -1276,15 +1319,20 @@ def reboot_dut(self):
time.sleep(self.reboot_delay)

if not self.kvm_test and\
(self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type):
(self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type):
self.sender_thr = threading.Thread(target = self.send_in_background)
self.sniff_thr = threading.Thread(target = self.sniff_in_background)
self.sniffer_started = threading.Event() # Event for the sniff_in_background status.
self.sniff_thr.start()
self.sender_thr.start()

self.log("Rebooting remote side")
stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30)
if self.reboot_type != 'service-warm-restart':
stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30)
else:
self.restart_service()
return

if stdout != []:
self.log("stdout from %s: %s" % (self.reboot_type, str(stdout)))
if stderr != []:
Expand All @@ -1300,6 +1348,42 @@ def reboot_dut(self):

return

def restart_service(self):
for service_name in self.test_params['service_list']:
if 'image_path_on_dut' in self.service_data[service_name]:
stdout, stderr, return_code = self.dut_connection.execCommand("sudo sonic-installer upgrade-docker {} {} -y --warm".format(service_name, self.service_data[service_name]['image_path_on_dut']), timeout=30)
Junchao-Mellanox marked this conversation as resolved.
Show resolved Hide resolved
else:
self.dut_connection.execCommand('sudo config warm_restart enable {}'.format(service_name))
self.pre_service_warm_restart(service_name)
stdout, stderr, return_code = self.dut_connection.execCommand('sudo service {} restart'.format(service_name))

if stdout != []:
self.log("stdout from %s %s: %s" % (self.reboot_type, service_name, str(stdout)))
if stderr != []:
self.log("stderr from %s %s: %s" % (self.reboot_type, service_name, str(stderr)))
self.fails['dut'].add("service warm restart {} failed with error {}".format(service_name, stderr))
thread.interrupt_main()
raise Exception("{} failed with error {}".format(self.reboot_type, stderr))
self.log("return code from %s %s: %s" % (self.reboot_type, service_name, str(return_code)))
if return_code not in [0, 255]:
thread.interrupt_main()

def pre_service_warm_restart(self, service_name):
"""Copy from src/sonic-utilities/sonic_installer/main.py to do some special operation for particular containers
"""
if service_name == 'swss':
cmd = 'docker exec -i swss orchagent_restart_check -w 2000 -r 5'
stdout, stderr, return_code = self.dut_connection.execCommand(cmd)
if return_code != 0:
self.log('stdout from {}: {}'.format(cmd, str(stdout)))
self.log('stderr from {}: {}'.format(cmd, str(stderr)))
self.log('orchagent is not in clean state, RESTARTCHECK failed: {}'.format(return_code))
Junchao-Mellanox marked this conversation as resolved.
Show resolved Hide resolved
elif service_name == 'bgp':
self.dut_connection.execCommand('docker exec -i bgp pkill -9 zebra')
self.dut_connection.execCommand('docker exec -i bgp pkill -9 bgpd')
elif service_name == 'teamd':
self.dut_connection.execCommand('docker exec -i teamd pkill -USR1 teamd > /dev/null')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about syncd warm restart? In which case, there needs to be syncd preshutdown?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know, syncd warm restart is not supported.


def cmd(self, cmds):
process = subprocess.Popen(cmds,
shell=False,
Expand All @@ -1325,7 +1409,7 @@ def peer_state_check(self, ip, queue):
lacp_pdu_down_times and len(lacp_pdu_down_times) > 0 else None
lacp_pdu_after_reboot = float(lacp_pdu_up_times[0]) if\
lacp_pdu_up_times and len(lacp_pdu_up_times) > 0 else None
if 'warm-reboot' in self.reboot_type and lacp_pdu_before_reboot and lacp_pdu_after_reboot:
if ('warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type) and lacp_pdu_before_reboot and lacp_pdu_after_reboot:
lacp_time_diff = lacp_pdu_after_reboot - lacp_pdu_before_reboot
if lacp_time_diff >= 90 and not self.kvm_test:
self.fails['dut'].add("LACP session likely terminated by neighbor ({})".format(ip) +\
Expand Down
11 changes: 9 additions & 2 deletions ansible/roles/test/files/ptftests/arista.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ def run(self):
log_data = self.parse_logs(log_lines)
if (self.reboot_type == 'fast-reboot' and \
any(k.startswith('BGP') for k in log_data) and any(k.startswith('PortChannel') for k in log_data)) \
or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)):
or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)) \
or (self.reboot_type == 'service-warm-restart' and any(k.startswith('BGP') for k in log_data)):
log_present = True
break
time.sleep(1) # wait until logs are populated
Expand Down Expand Up @@ -324,6 +325,8 @@ def parse_logs(self, data):
return result
elif self.reboot_type == 'warm-reboot' and initial_time_bgp == -1:
return result
elif self.reboot_type == 'service-warm-restart' and initial_time_bgp == -1:
return result

for events in result_bgp.values():
if events[-1][1] != 'Established':
Expand Down Expand Up @@ -592,7 +595,7 @@ def change_bgp_neigh_state(self, asn, is_up=True):
if is_up == True:
self.do_cmd('%s' % state[is_up])
else:
# shutdown BGP will pop confirm message, the message is
# shutdown BGP will pop confirm message, the message is
# "You are attempting to shutdown BGP. Are you sure you want to shutdown? [confirm]"
self.do_cmd('%s' % state[is_up], prompt = '[confirm]')
self.do_cmd('y')
Expand Down Expand Up @@ -707,6 +710,10 @@ def check_series_status(self, output, entity, what):
self.fails.add("%s must be up when the test stops" % what)
return 0, 0

if len(sorted_keys) == 1:
# for service warm restart, the down count could be 0
return 0, 0

start = sorted_keys[0]
cur_state = True
res = defaultdict(list)
Expand Down
83 changes: 79 additions & 4 deletions tests/common/fixtures/advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs
@param tbinfo: fixture provides information about testbed
@param kwargs: extra parameters including reboot type
'''
assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType']) , (
assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType'] or 'service-warm-restart' in kwargs['rebootType']) , (
"Please set rebootType var."
)

Expand Down Expand Up @@ -90,6 +90,10 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs

self.__buildTestbedData(tbinfo)

if self.rebootType == 'service-warm-restart':
assert hasattr(self, 'service_list')
self.service_data = {}

def __extractTestParam(self):
'''
Extract test parameters from pytest request object. Note that all the parameters have default values.
Expand All @@ -106,6 +110,7 @@ def __extractTestParam(self):
self.replaceFastRebootScript = self.request.config.getoption("--replace_fast_reboot_script")
self.postRebootCheckScript = self.request.config.getoption("--post_reboot_check_script")
self.bgpV4V6TimeDiff = self.request.config.getoption("--bgp_v4_v6_time_diff")
self.new_docker_image = self.request.config.getoption("--new_docker_image")

# Set default reboot limit if it is not given
if self.rebootLimit is None:
Expand Down Expand Up @@ -326,6 +331,35 @@ def __handleRebootImage(self):
logger.info('Remove downloaded tempfile')
self.duthost.shell('rm -f {}'.format(tempfile))

def __handleDockerImage(self):
"""Download and install docker image to DUT
"""
if self.new_docker_image is None:
return

for service_name in self.service_list:
data = {}
docker_image_name = self.duthost.shell('docker ps | grep {} | awk \'{{print $2}}\''.format(service_name))['stdout']
cmd = 'docker images {} --format \{{\{{.ID\}}\}}'.format(docker_image_name)
data['image_id'] = self.duthost.shell(cmd)['stdout']
data['image_name'], data['image_tag'] = docker_image_name.split(':')

local_image_path = '/tmp/{}.gz'.format(data['image_name'])
logger.info('Downloading new docker image for {} to {}'.format(service_name, local_image_path))
output = self.localhost.shell('curl --silent --write-out "%{{http_code}}" {0}/{1}.gz --output {2}'.format(self.new_docker_image, data['image_name'], local_image_path), module_ignore_errors=True)['stdout']
if '404' not in output and os.path.exists(local_image_path):
temp_file = self.duthost.shell('mktemp')['stdout']
self.duthost.copy(src=local_image_path, dest=temp_file)
self.localhost.shell('rm -f /tmp/{}.gz'.format(data['image_name']))
data['image_path_on_dut'] = temp_file
logger.info('Successfully downloaded docker image, will perform in-service upgrade')
else:
data['image_path_on_dut'] = None
logger.info('Docker image not found, will perform in-service reboot')
self.service_data[service_name] = data

logger.info('service data = {}'.format(json.dumps(self.service_data, indent=2)))

def __setupTestbed(self):
'''
Sets testbed up. It tranfers test data files, ARP responder, and runs script to update IPs and MAC addresses.
Expand Down Expand Up @@ -438,7 +472,10 @@ def imageInstall(self, prebootList=None, inbootList=None, prebootFiles=None):
self.__setupTestbed()

# Download and install new sonic image
self.__handleRebootImage()
if self.rebootType != 'service-warm-restart':
self.__handleRebootImage()
else:
self.__handleDockerImage()

# Handle mellanox platform
self.__handleMellanoxDut()
Expand Down Expand Up @@ -586,7 +623,9 @@ def __runPtfRunner(self, rebootOper=None):
"asic_type": self.duthost.facts["asic_type"],
"allow_mac_jumping": self.allowMacJump,
"preboot_files" : self.prebootFiles,
"alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword")
"alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword"),
"service_list": None if self.rebootType != 'service-warm-restart' else self.service_list,
"service_data": None if self.rebootType != 'service-warm-restart' else self.service_data,
}

if not isinstance(rebootOper, SadOperation):
Expand Down Expand Up @@ -639,6 +678,36 @@ def __restorePrevImage(self):
wait = self.readyTimeout
)

def __restorePrevDockerImage(self):
"""Restore previous docker image.
"""
for service_name, data in self.service_data.items():
if data['image_path_on_dut'] is None:
self.duthost.shell('sudo config warm_restart disable {}'.format(service_name))
continue

# We don't use sonic-installer rollback-docker CLI here because:
# 1. it does not remove the docker image which causes the running container still using the old image
# 2. it runs service restart for some containers which enlarge the test duration
self.duthost.shell('rm -f {}'.format(data['image_path_on_dut']))
logger.info('Restore docker image for {}'.format(service_name))
self.duthost.shell('service {} stop'.format(service_name))
self.duthost.shell('docker rm {}'.format(service_name))
image_ids = self.duthost.shell('docker images {} --format \{{\{{.ID\}}\}}'.format(data['image_name']))['stdout_lines']
for image_id in image_ids:
if image_id != data['image_id']:
self.duthost.shell('docker rmi -f {}'.format(image_id))
break

self.duthost.shell('docker tag {} {}:{}'.format(data['image_id'], data['image_name'], data['image_tag']))

rebootDut(
self.duthost,
self.localhost,
reboot_type='cold',
wait = 300
)

def tearDown(self):
'''
Tears down test case. It also verifies that config_db.json exists.
Expand All @@ -658,7 +727,13 @@ def tearDown(self):
self.__runScript([self.postRebootCheckScript], self.duthost)

if not self.stayInTargetImage:
self.__restorePrevImage()
logger.info('Restoring previous image')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stay_in_target_image arg is by default set to true. I think it would be better to always revert the device back to pretest state. Otherwise it can impact subsequent testcases that might run in long running regression tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stay_in_target_image is a flag specified by user. I agree with you that the default value should be false, however, I am not sure if we should change the default behavior here. Because the test case is already running on regression system, vendors may not expect to have the default behavior changed.

if self.rebootType != 'service-warm-restart':
self.__restorePrevImage()
else:
self.__restorePrevDockerImage()
else:
logger.info('Stay in new image')

@pytest.fixture
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo, creds):
Expand Down
16 changes: 16 additions & 0 deletions tests/platform_tests/args/advanced_reboot_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,22 @@ def add_advanced_reboot_args(parser):
help="URL of new sonic image",
)

parser.addoption(
"--new_docker_image",
action="store",
type=str,
default=None,
help="URL of new docker image",
)

parser.addoption(
"--ignore_service",
action="store",
type=str,
default=None,
help="Services that ignore for warm restart test",
)

parser.addoption(
"--ready_timeout",
action="store",
Expand Down
1 change: 0 additions & 1 deletion tests/platform_tests/test_advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
pytest.mark.topology('t0')
]


def pytest_generate_tests(metafunc):
input_sad_cases = metafunc.config.getoption("sad_case_list")
input_sad_list = list()
Expand Down
Loading