From 9b5a4398f5dd8489fc409e89668865fcf06b69de Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Fri, 26 Aug 2022 06:55:55 +0800 Subject: [PATCH] Add new test case for container based warm restart (#6054) Added a new test case test_advanced_reboot::test_service_warm_restart. This test tries to run warm restart for each service in FEATURE table. If a URL is provided by CLI option "--new_docker_image", the test case will try download new docker image from the URL and do in service warm upgrade. The new test case reused the advanced-reboot infrastructure in sonic-mgmt and added some special logic for container based warm restart --- .../test/files/ptftests/advanced-reboot.py | 98 +++++++++++++++++-- ansible/roles/test/files/ptftests/arista.py | 11 ++- tests/common/fixtures/advanced_reboot.py | 83 +++++++++++++++- .../args/advanced_reboot_args.py | 16 +++ tests/platform_tests/test_advanced_reboot.py | 1 - .../test_service_warm_restart.py | 72 ++++++++++++++ 6 files changed, 267 insertions(+), 14 deletions(-) create mode 100644 tests/platform_tests/test_service_warm_restart.py diff --git a/ansible/roles/test/files/ptftests/advanced-reboot.py b/ansible/roles/test/files/ptftests/advanced-reboot.py index 51fb284bf77..cc07d2cc58b 100644 --- a/ansible/roles/test/files/ptftests/advanced-reboot.py +++ b/ansible/roles/test/files/ptftests/advanced-reboot.py @@ -253,6 +253,17 @@ def __init__(self): self.kvm_test = True else: self.kvm_test = False + if "service-warm-restart" in self.test_params['reboot_type']: + self.check_param('service_list', None, required=True) + self.check_param('service_data', None, required=True) + self.service_data = self.test_params['service_data'] + for service_name in self.test_params['service_list']: + cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name) + stdout, _, _ = self.dut_connection.execCommand(cmd) + if service_name not in self.service_data: + self.service_data[service_name] = {} + self.service_data[service_name]['service_start_time'] = str(stdout[0]).strip() + self.log("Service start time for {} is {}".format(service_name, self.service_data[service_name]['service_start_time'])) return def read_json(self, name): @@ -437,7 +448,7 @@ def build_vlan_if_port_mapping(self): portchannel_names = [pc['name'] for pc in portchannel_content.values()] vlan_content = self.read_json('vlan_ports_file') - + vlan_if_port = [] for vlan in self.vlan_ip_range: for ifname in vlan_content[vlan]['members']: @@ -926,6 +937,31 @@ def wait_until_control_plane_up(self): self.no_control_stop = datetime.datetime.now() self.log("Dut reboots: control plane up at %s" % str(self.no_control_stop)) + def wait_until_service_restart(self): + self.log("Wait until sevice restart") + self.reboot_start = datetime.datetime.now() + service_set = set(self.test_params['service_list']) + wait_time = 120 + while wait_time > 0: + for service_name in self.test_params['service_list']: + if service_name not in service_set: + continue + cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name) + stdout, _, _ = self.dut_connection.execCommand(cmd) + if self.service_data[service_name]['service_start_time'] != str(stdout[0]).strip(): + service_set.remove(service_name) + if not service_set: + break + wait_time -= 10 + time.sleep(10) + + if service_set: + self.fails['dut'].add("Container {} hasn't come back up in {} seconds".format(','.join(service_set), wait_time)) + raise TimeoutError + + # TODO: add timestamp + self.log("Service has restarted") + def handle_fast_reboot_health_check(self): self.log("Check that device is still forwarding data plane traffic") self.fails['dut'].add("Data plane has a forwarding problem after CPU went down") @@ -1017,6 +1053,10 @@ def wait_for_ssh_threads(signal): # verify there are no interface flaps after warm boot self.neigh_lag_status_check() + if 'service-warm-restart' == self.reboot_type: + # verify there are no interface flaps after warm boot + self.neigh_lag_status_check() + def handle_advanced_reboot_health_check_kvm(self): self.log("Wait until data plane stops") forward_stop_signal = multiprocessing.Event() @@ -1193,8 +1233,11 @@ def runTest(self): thr = threading.Thread(target=self.reboot_dut) thr.setDaemon(True) thr.start() - self.wait_until_control_plane_down() - self.no_control_start = self.cpu_state.get_state_time('down') + if self.reboot_type != 'service-warm-restart': + self.wait_until_control_plane_down() + self.no_control_start = self.cpu_state.get_state_time('down') + else: + self.wait_until_service_restart() if 'warm-reboot' in self.reboot_type: finalizer_timeout = 60 + self.test_params['reboot_limit_in_seconds'] @@ -1210,7 +1253,7 @@ def runTest(self): else: if self.reboot_type == 'fast-reboot': self.handle_fast_reboot_health_check() - if 'warm-reboot' in self.reboot_type: + if 'warm-reboot' in self.reboot_type or 'service-warm-restart' == self.reboot_type: self.handle_warm_reboot_health_check() self.handle_post_reboot_health_check() @@ -1276,7 +1319,7 @@ def reboot_dut(self): time.sleep(self.reboot_delay) if not self.kvm_test and\ - (self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type): + (self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type): self.sender_thr = threading.Thread(target = self.send_in_background) self.sniff_thr = threading.Thread(target = self.sniff_in_background) self.sniffer_started = threading.Event() # Event for the sniff_in_background status. @@ -1284,7 +1327,12 @@ def reboot_dut(self): self.sender_thr.start() self.log("Rebooting remote side") - stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30) + if self.reboot_type != 'service-warm-restart': + stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30) + else: + self.restart_service() + return + if stdout != []: self.log("stdout from %s: %s" % (self.reboot_type, str(stdout))) if stderr != []: @@ -1300,6 +1348,42 @@ def reboot_dut(self): return + def restart_service(self): + for service_name in self.test_params['service_list']: + if 'image_path_on_dut' in self.service_data[service_name]: + stdout, stderr, return_code = self.dut_connection.execCommand("sudo sonic-installer upgrade-docker {} {} -y --warm".format(service_name, self.service_data[service_name]['image_path_on_dut']), timeout=30) + else: + self.dut_connection.execCommand('sudo config warm_restart enable {}'.format(service_name)) + self.pre_service_warm_restart(service_name) + stdout, stderr, return_code = self.dut_connection.execCommand('sudo service {} restart'.format(service_name)) + + if stdout != []: + self.log("stdout from %s %s: %s" % (self.reboot_type, service_name, str(stdout))) + if stderr != []: + self.log("stderr from %s %s: %s" % (self.reboot_type, service_name, str(stderr))) + self.fails['dut'].add("service warm restart {} failed with error {}".format(service_name, stderr)) + thread.interrupt_main() + raise Exception("{} failed with error {}".format(self.reboot_type, stderr)) + self.log("return code from %s %s: %s" % (self.reboot_type, service_name, str(return_code))) + if return_code not in [0, 255]: + thread.interrupt_main() + + def pre_service_warm_restart(self, service_name): + """Copy from src/sonic-utilities/sonic_installer/main.py to do some special operation for particular containers + """ + if service_name == 'swss': + cmd = 'docker exec -i swss orchagent_restart_check -w 2000 -r 5' + stdout, stderr, return_code = self.dut_connection.execCommand(cmd) + if return_code != 0: + self.log('stdout from {}: {}'.format(cmd, str(stdout))) + self.log('stderr from {}: {}'.format(cmd, str(stderr))) + self.log('orchagent is not in clean state, RESTARTCHECK failed: {}'.format(return_code)) + elif service_name == 'bgp': + self.dut_connection.execCommand('docker exec -i bgp pkill -9 zebra') + self.dut_connection.execCommand('docker exec -i bgp pkill -9 bgpd') + elif service_name == 'teamd': + self.dut_connection.execCommand('docker exec -i teamd pkill -USR1 teamd > /dev/null') + def cmd(self, cmds): process = subprocess.Popen(cmds, shell=False, @@ -1325,7 +1409,7 @@ def peer_state_check(self, ip, queue): lacp_pdu_down_times and len(lacp_pdu_down_times) > 0 else None lacp_pdu_after_reboot = float(lacp_pdu_up_times[0]) if\ lacp_pdu_up_times and len(lacp_pdu_up_times) > 0 else None - if 'warm-reboot' in self.reboot_type and lacp_pdu_before_reboot and lacp_pdu_after_reboot: + if ('warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type) and lacp_pdu_before_reboot and lacp_pdu_after_reboot: lacp_time_diff = lacp_pdu_after_reboot - lacp_pdu_before_reboot if lacp_time_diff >= 90 and not self.kvm_test: self.fails['dut'].add("LACP session likely terminated by neighbor ({})".format(ip) +\ diff --git a/ansible/roles/test/files/ptftests/arista.py b/ansible/roles/test/files/ptftests/arista.py index 46a15ab5c99..fb69002d886 100644 --- a/ansible/roles/test/files/ptftests/arista.py +++ b/ansible/roles/test/files/ptftests/arista.py @@ -241,7 +241,8 @@ def run(self): log_data = self.parse_logs(log_lines) if (self.reboot_type == 'fast-reboot' and \ any(k.startswith('BGP') for k in log_data) and any(k.startswith('PortChannel') for k in log_data)) \ - or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)): + or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)) \ + or (self.reboot_type == 'service-warm-restart' and any(k.startswith('BGP') for k in log_data)): log_present = True break time.sleep(1) # wait until logs are populated @@ -324,6 +325,8 @@ def parse_logs(self, data): return result elif self.reboot_type == 'warm-reboot' and initial_time_bgp == -1: return result + elif self.reboot_type == 'service-warm-restart' and initial_time_bgp == -1: + return result for events in result_bgp.values(): if events[-1][1] != 'Established': @@ -592,7 +595,7 @@ def change_bgp_neigh_state(self, asn, is_up=True): if is_up == True: self.do_cmd('%s' % state[is_up]) else: - # shutdown BGP will pop confirm message, the message is + # shutdown BGP will pop confirm message, the message is # "You are attempting to shutdown BGP. Are you sure you want to shutdown? [confirm]" self.do_cmd('%s' % state[is_up], prompt = '[confirm]') self.do_cmd('y') @@ -707,6 +710,10 @@ def check_series_status(self, output, entity, what): self.fails.add("%s must be up when the test stops" % what) return 0, 0 + if len(sorted_keys) == 1: + # for service warm restart, the down count could be 0 + return 0, 0 + start = sorted_keys[0] cur_state = True res = defaultdict(list) diff --git a/tests/common/fixtures/advanced_reboot.py b/tests/common/fixtures/advanced_reboot.py index 3a6bdad4032..7381c6c3f9b 100644 --- a/tests/common/fixtures/advanced_reboot.py +++ b/tests/common/fixtures/advanced_reboot.py @@ -42,7 +42,7 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs @param tbinfo: fixture provides information about testbed @param kwargs: extra parameters including reboot type ''' - assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType']) , ( + assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType'] or 'service-warm-restart' in kwargs['rebootType']) , ( "Please set rebootType var." ) @@ -90,6 +90,10 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs self.__buildTestbedData(tbinfo) + if self.rebootType == 'service-warm-restart': + assert hasattr(self, 'service_list') + self.service_data = {} + def __extractTestParam(self): ''' Extract test parameters from pytest request object. Note that all the parameters have default values. @@ -106,6 +110,7 @@ def __extractTestParam(self): self.replaceFastRebootScript = self.request.config.getoption("--replace_fast_reboot_script") self.postRebootCheckScript = self.request.config.getoption("--post_reboot_check_script") self.bgpV4V6TimeDiff = self.request.config.getoption("--bgp_v4_v6_time_diff") + self.new_docker_image = self.request.config.getoption("--new_docker_image") # Set default reboot limit if it is not given if self.rebootLimit is None: @@ -326,6 +331,35 @@ def __handleRebootImage(self): logger.info('Remove downloaded tempfile') self.duthost.shell('rm -f {}'.format(tempfile)) + def __handleDockerImage(self): + """Download and install docker image to DUT + """ + if self.new_docker_image is None: + return + + for service_name in self.service_list: + data = {} + docker_image_name = self.duthost.shell('docker ps | grep {} | awk \'{{print $2}}\''.format(service_name))['stdout'] + cmd = 'docker images {} --format \{{\{{.ID\}}\}}'.format(docker_image_name) + data['image_id'] = self.duthost.shell(cmd)['stdout'] + data['image_name'], data['image_tag'] = docker_image_name.split(':') + + local_image_path = '/tmp/{}.gz'.format(data['image_name']) + logger.info('Downloading new docker image for {} to {}'.format(service_name, local_image_path)) + output = self.localhost.shell('curl --silent --write-out "%{{http_code}}" {0}/{1}.gz --output {2}'.format(self.new_docker_image, data['image_name'], local_image_path), module_ignore_errors=True)['stdout'] + if '404' not in output and os.path.exists(local_image_path): + temp_file = self.duthost.shell('mktemp')['stdout'] + self.duthost.copy(src=local_image_path, dest=temp_file) + self.localhost.shell('rm -f /tmp/{}.gz'.format(data['image_name'])) + data['image_path_on_dut'] = temp_file + logger.info('Successfully downloaded docker image, will perform in-service upgrade') + else: + data['image_path_on_dut'] = None + logger.info('Docker image not found, will perform in-service reboot') + self.service_data[service_name] = data + + logger.info('service data = {}'.format(json.dumps(self.service_data, indent=2))) + def __setupTestbed(self): ''' Sets testbed up. It tranfers test data files, ARP responder, and runs script to update IPs and MAC addresses. @@ -438,7 +472,10 @@ def imageInstall(self, prebootList=None, inbootList=None, prebootFiles=None): self.__setupTestbed() # Download and install new sonic image - self.__handleRebootImage() + if self.rebootType != 'service-warm-restart': + self.__handleRebootImage() + else: + self.__handleDockerImage() # Handle mellanox platform self.__handleMellanoxDut() @@ -586,7 +623,9 @@ def __runPtfRunner(self, rebootOper=None): "asic_type": self.duthost.facts["asic_type"], "allow_mac_jumping": self.allowMacJump, "preboot_files" : self.prebootFiles, - "alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword") + "alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword"), + "service_list": None if self.rebootType != 'service-warm-restart' else self.service_list, + "service_data": None if self.rebootType != 'service-warm-restart' else self.service_data, } if not isinstance(rebootOper, SadOperation): @@ -639,6 +678,36 @@ def __restorePrevImage(self): wait = self.readyTimeout ) + def __restorePrevDockerImage(self): + """Restore previous docker image. + """ + for service_name, data in self.service_data.items(): + if data['image_path_on_dut'] is None: + self.duthost.shell('sudo config warm_restart disable {}'.format(service_name)) + continue + + # We don't use sonic-installer rollback-docker CLI here because: + # 1. it does not remove the docker image which causes the running container still using the old image + # 2. it runs service restart for some containers which enlarge the test duration + self.duthost.shell('rm -f {}'.format(data['image_path_on_dut'])) + logger.info('Restore docker image for {}'.format(service_name)) + self.duthost.shell('service {} stop'.format(service_name)) + self.duthost.shell('docker rm {}'.format(service_name)) + image_ids = self.duthost.shell('docker images {} --format \{{\{{.ID\}}\}}'.format(data['image_name']))['stdout_lines'] + for image_id in image_ids: + if image_id != data['image_id']: + self.duthost.shell('docker rmi -f {}'.format(image_id)) + break + + self.duthost.shell('docker tag {} {}:{}'.format(data['image_id'], data['image_name'], data['image_tag'])) + + rebootDut( + self.duthost, + self.localhost, + reboot_type='cold', + wait = 300 + ) + def tearDown(self): ''' Tears down test case. It also verifies that config_db.json exists. @@ -658,7 +727,13 @@ def tearDown(self): self.__runScript([self.postRebootCheckScript], self.duthost) if not self.stayInTargetImage: - self.__restorePrevImage() + logger.info('Restoring previous image') + if self.rebootType != 'service-warm-restart': + self.__restorePrevImage() + else: + self.__restorePrevDockerImage() + else: + logger.info('Stay in new image') @pytest.fixture def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo, creds): diff --git a/tests/platform_tests/args/advanced_reboot_args.py b/tests/platform_tests/args/advanced_reboot_args.py index 1103a15a0d2..37d1f993fea 100644 --- a/tests/platform_tests/args/advanced_reboot_args.py +++ b/tests/platform_tests/args/advanced_reboot_args.py @@ -69,6 +69,22 @@ def add_advanced_reboot_args(parser): help="URL of new sonic image", ) + parser.addoption( + "--new_docker_image", + action="store", + type=str, + default=None, + help="URL of new docker image", + ) + + parser.addoption( + "--ignore_service", + action="store", + type=str, + default=None, + help="Services that ignore for warm restart test", + ) + parser.addoption( "--ready_timeout", action="store", diff --git a/tests/platform_tests/test_advanced_reboot.py b/tests/platform_tests/test_advanced_reboot.py index 46c85ea08f4..8b4a1501c54 100644 --- a/tests/platform_tests/test_advanced_reboot.py +++ b/tests/platform_tests/test_advanced_reboot.py @@ -13,7 +13,6 @@ pytest.mark.topology('t0') ] - def pytest_generate_tests(metafunc): input_sad_cases = metafunc.config.getoption("sad_case_list") input_sad_list = list() diff --git a/tests/platform_tests/test_service_warm_restart.py b/tests/platform_tests/test_service_warm_restart.py new file mode 100644 index 00000000000..46ee78068bd --- /dev/null +++ b/tests/platform_tests/test_service_warm_restart.py @@ -0,0 +1,72 @@ +import pytest +import logging + +from tests.common.helpers.assertions import pytest_require +from tests.common.utilities import skip_release +from tests.platform_tests.verify_dut_health import verify_dut_health # lgtm[py/unused-import] + +pytestmark = [ + pytest.mark.disable_loganalyzer, + pytest.mark.topology('t0') +] + +@pytest.fixture(autouse=True, scope="module") +def check_image_version(duthost): + """Skips this test if the SONiC image installed on DUT is older than 202205 + + Args: + duthost: Hostname of DUT. + + Returns: + None. + """ + skip_release(duthost, ["201811", "201911", "202012", "202106"]) + + +def test_service_warm_restart(request, duthosts, rand_one_dut_hostname, verify_dut_health, get_advanced_reboot, + advanceboot_loganalyzer, capture_interface_counters): + duthost = duthosts[rand_one_dut_hostname] + + # Get built-in service + spm_data = duthost.show_and_parse('spm list') + built_in_repo_set = {item['repository'] for item in spm_data if item['status'] == 'Built-In'} + built_in_service_set = set() + for built_in_repo in built_in_repo_set: + service_name = duthost.shell('docker ps --filter "ancestor={}" --format \{{\{{.Names\}}\}}'.format(built_in_repo))['stdout'] + service_name = service_name.strip() + if service_name: + built_in_service_set.add(service_name) + else: + logging.info('service with docker repo {} is not enabled or running, skip warm restart for it'.format(built_in_repo)) + + feature_list = duthost.show_and_parse('show feature status') + ignored_services = request.config.getoption("--ignore_service") + candidate_service_list = [] + for feature_data in feature_list: + if feature_data['feature'] in ['database', 'syncd']: + # Features that do not support warm restart + continue + + if feature_data['feature'] not in built_in_service_set: + # There is no guarantee that non built-in feature support warm restart, so ignore them + logging.info('Feature {} is not a built-in feature, skip warm restart for it.'.format(feature_data['feature'])) + continue + + if ignored_services: + ignored_services = ignored_services.split(',') + if feature_data['feature'] in ignored_services: + logging.info("Feature {} is ignored by user, skip warm restart for it.".format(feature_data['feature'])) + continue + + if feature_data['state'] == 'disabled' or feature_data['state'] == 'always_disabled': + logging.info("Feature {} is not enabled, skip warm restart for it.".format(feature_data['feature'])) + continue + + candidate_service_list.append(feature_data['feature']) + + pytest_require(candidate_service_list, 'Skip service warm restart test because candidate_service_list is empty') + + advancedReboot = get_advanced_reboot(rebootType='service-warm-restart', + service_list=candidate_service_list, + advanceboot_loganalyzer=advanceboot_loganalyzer) + advancedReboot.runRebootTestcase()