Skip to content

Commit

Permalink
Add new test case for container based warm restart (#6054)
Browse files Browse the repository at this point in the history
Added a new test case test_advanced_reboot::test_service_warm_restart.

This test tries to run warm restart for each service in FEATURE table.
If a URL is provided by CLI option "--new_docker_image", the test case will try download new docker image from the URL and do in service warm upgrade.

The new test case reused the advanced-reboot infrastructure in sonic-mgmt and added some special logic for container based warm restart
  • Loading branch information
Junchao-Mellanox authored Aug 25, 2022
1 parent 83d7643 commit 9b5a439
Show file tree
Hide file tree
Showing 6 changed files with 267 additions and 14 deletions.
98 changes: 91 additions & 7 deletions ansible/roles/test/files/ptftests/advanced-reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,17 @@ def __init__(self):
self.kvm_test = True
else:
self.kvm_test = False
if "service-warm-restart" in self.test_params['reboot_type']:
self.check_param('service_list', None, required=True)
self.check_param('service_data', None, required=True)
self.service_data = self.test_params['service_data']
for service_name in self.test_params['service_list']:
cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name)
stdout, _, _ = self.dut_connection.execCommand(cmd)
if service_name not in self.service_data:
self.service_data[service_name] = {}
self.service_data[service_name]['service_start_time'] = str(stdout[0]).strip()
self.log("Service start time for {} is {}".format(service_name, self.service_data[service_name]['service_start_time']))
return

def read_json(self, name):
Expand Down Expand Up @@ -437,7 +448,7 @@ def build_vlan_if_port_mapping(self):
portchannel_names = [pc['name'] for pc in portchannel_content.values()]

vlan_content = self.read_json('vlan_ports_file')

vlan_if_port = []
for vlan in self.vlan_ip_range:
for ifname in vlan_content[vlan]['members']:
Expand Down Expand Up @@ -926,6 +937,31 @@ def wait_until_control_plane_up(self):
self.no_control_stop = datetime.datetime.now()
self.log("Dut reboots: control plane up at %s" % str(self.no_control_stop))

def wait_until_service_restart(self):
self.log("Wait until sevice restart")
self.reboot_start = datetime.datetime.now()
service_set = set(self.test_params['service_list'])
wait_time = 120
while wait_time > 0:
for service_name in self.test_params['service_list']:
if service_name not in service_set:
continue
cmd = 'systemctl show -p ExecMainStartTimestamp {}'.format(service_name)
stdout, _, _ = self.dut_connection.execCommand(cmd)
if self.service_data[service_name]['service_start_time'] != str(stdout[0]).strip():
service_set.remove(service_name)
if not service_set:
break
wait_time -= 10
time.sleep(10)

if service_set:
self.fails['dut'].add("Container {} hasn't come back up in {} seconds".format(','.join(service_set), wait_time))
raise TimeoutError

# TODO: add timestamp
self.log("Service has restarted")

def handle_fast_reboot_health_check(self):
self.log("Check that device is still forwarding data plane traffic")
self.fails['dut'].add("Data plane has a forwarding problem after CPU went down")
Expand Down Expand Up @@ -1017,6 +1053,10 @@ def wait_for_ssh_threads(signal):
# verify there are no interface flaps after warm boot
self.neigh_lag_status_check()

if 'service-warm-restart' == self.reboot_type:
# verify there are no interface flaps after warm boot
self.neigh_lag_status_check()

def handle_advanced_reboot_health_check_kvm(self):
self.log("Wait until data plane stops")
forward_stop_signal = multiprocessing.Event()
Expand Down Expand Up @@ -1193,8 +1233,11 @@ def runTest(self):
thr = threading.Thread(target=self.reboot_dut)
thr.setDaemon(True)
thr.start()
self.wait_until_control_plane_down()
self.no_control_start = self.cpu_state.get_state_time('down')
if self.reboot_type != 'service-warm-restart':
self.wait_until_control_plane_down()
self.no_control_start = self.cpu_state.get_state_time('down')
else:
self.wait_until_service_restart()

if 'warm-reboot' in self.reboot_type:
finalizer_timeout = 60 + self.test_params['reboot_limit_in_seconds']
Expand All @@ -1210,7 +1253,7 @@ def runTest(self):
else:
if self.reboot_type == 'fast-reboot':
self.handle_fast_reboot_health_check()
if 'warm-reboot' in self.reboot_type:
if 'warm-reboot' in self.reboot_type or 'service-warm-restart' == self.reboot_type:
self.handle_warm_reboot_health_check()
self.handle_post_reboot_health_check()

Expand Down Expand Up @@ -1276,15 +1319,20 @@ def reboot_dut(self):
time.sleep(self.reboot_delay)

if not self.kvm_test and\
(self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type):
(self.reboot_type == 'fast-reboot' or 'warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type):
self.sender_thr = threading.Thread(target = self.send_in_background)
self.sniff_thr = threading.Thread(target = self.sniff_in_background)
self.sniffer_started = threading.Event() # Event for the sniff_in_background status.
self.sniff_thr.start()
self.sender_thr.start()

self.log("Rebooting remote side")
stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30)
if self.reboot_type != 'service-warm-restart':
stdout, stderr, return_code = self.dut_connection.execCommand("sudo " + self.reboot_type, timeout=30)
else:
self.restart_service()
return

if stdout != []:
self.log("stdout from %s: %s" % (self.reboot_type, str(stdout)))
if stderr != []:
Expand All @@ -1300,6 +1348,42 @@ def reboot_dut(self):

return

def restart_service(self):
for service_name in self.test_params['service_list']:
if 'image_path_on_dut' in self.service_data[service_name]:
stdout, stderr, return_code = self.dut_connection.execCommand("sudo sonic-installer upgrade-docker {} {} -y --warm".format(service_name, self.service_data[service_name]['image_path_on_dut']), timeout=30)
else:
self.dut_connection.execCommand('sudo config warm_restart enable {}'.format(service_name))
self.pre_service_warm_restart(service_name)
stdout, stderr, return_code = self.dut_connection.execCommand('sudo service {} restart'.format(service_name))

if stdout != []:
self.log("stdout from %s %s: %s" % (self.reboot_type, service_name, str(stdout)))
if stderr != []:
self.log("stderr from %s %s: %s" % (self.reboot_type, service_name, str(stderr)))
self.fails['dut'].add("service warm restart {} failed with error {}".format(service_name, stderr))
thread.interrupt_main()
raise Exception("{} failed with error {}".format(self.reboot_type, stderr))
self.log("return code from %s %s: %s" % (self.reboot_type, service_name, str(return_code)))
if return_code not in [0, 255]:
thread.interrupt_main()

def pre_service_warm_restart(self, service_name):
"""Copy from src/sonic-utilities/sonic_installer/main.py to do some special operation for particular containers
"""
if service_name == 'swss':
cmd = 'docker exec -i swss orchagent_restart_check -w 2000 -r 5'
stdout, stderr, return_code = self.dut_connection.execCommand(cmd)
if return_code != 0:
self.log('stdout from {}: {}'.format(cmd, str(stdout)))
self.log('stderr from {}: {}'.format(cmd, str(stderr)))
self.log('orchagent is not in clean state, RESTARTCHECK failed: {}'.format(return_code))
elif service_name == 'bgp':
self.dut_connection.execCommand('docker exec -i bgp pkill -9 zebra')
self.dut_connection.execCommand('docker exec -i bgp pkill -9 bgpd')
elif service_name == 'teamd':
self.dut_connection.execCommand('docker exec -i teamd pkill -USR1 teamd > /dev/null')

def cmd(self, cmds):
process = subprocess.Popen(cmds,
shell=False,
Expand All @@ -1325,7 +1409,7 @@ def peer_state_check(self, ip, queue):
lacp_pdu_down_times and len(lacp_pdu_down_times) > 0 else None
lacp_pdu_after_reboot = float(lacp_pdu_up_times[0]) if\
lacp_pdu_up_times and len(lacp_pdu_up_times) > 0 else None
if 'warm-reboot' in self.reboot_type and lacp_pdu_before_reboot and lacp_pdu_after_reboot:
if ('warm-reboot' in self.reboot_type or 'service-warm-restart' in self.reboot_type) and lacp_pdu_before_reboot and lacp_pdu_after_reboot:
lacp_time_diff = lacp_pdu_after_reboot - lacp_pdu_before_reboot
if lacp_time_diff >= 90 and not self.kvm_test:
self.fails['dut'].add("LACP session likely terminated by neighbor ({})".format(ip) +\
Expand Down
11 changes: 9 additions & 2 deletions ansible/roles/test/files/ptftests/arista.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,8 @@ def run(self):
log_data = self.parse_logs(log_lines)
if (self.reboot_type == 'fast-reboot' and \
any(k.startswith('BGP') for k in log_data) and any(k.startswith('PortChannel') for k in log_data)) \
or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)):
or (self.reboot_type == 'warm-reboot' and any(k.startswith('BGP') for k in log_data)) \
or (self.reboot_type == 'service-warm-restart' and any(k.startswith('BGP') for k in log_data)):
log_present = True
break
time.sleep(1) # wait until logs are populated
Expand Down Expand Up @@ -324,6 +325,8 @@ def parse_logs(self, data):
return result
elif self.reboot_type == 'warm-reboot' and initial_time_bgp == -1:
return result
elif self.reboot_type == 'service-warm-restart' and initial_time_bgp == -1:
return result

for events in result_bgp.values():
if events[-1][1] != 'Established':
Expand Down Expand Up @@ -592,7 +595,7 @@ def change_bgp_neigh_state(self, asn, is_up=True):
if is_up == True:
self.do_cmd('%s' % state[is_up])
else:
# shutdown BGP will pop confirm message, the message is
# shutdown BGP will pop confirm message, the message is
# "You are attempting to shutdown BGP. Are you sure you want to shutdown? [confirm]"
self.do_cmd('%s' % state[is_up], prompt = '[confirm]')
self.do_cmd('y')
Expand Down Expand Up @@ -707,6 +710,10 @@ def check_series_status(self, output, entity, what):
self.fails.add("%s must be up when the test stops" % what)
return 0, 0

if len(sorted_keys) == 1:
# for service warm restart, the down count could be 0
return 0, 0

start = sorted_keys[0]
cur_state = True
res = defaultdict(list)
Expand Down
83 changes: 79 additions & 4 deletions tests/common/fixtures/advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs
@param tbinfo: fixture provides information about testbed
@param kwargs: extra parameters including reboot type
'''
assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType']) , (
assert 'rebootType' in kwargs and ('warm-reboot' in kwargs['rebootType'] or 'fast-reboot' in kwargs['rebootType'] or 'service-warm-restart' in kwargs['rebootType']) , (
"Please set rebootType var."
)

Expand Down Expand Up @@ -90,6 +90,10 @@ def __init__(self, request, duthost, ptfhost, localhost, tbinfo, creds, **kwargs

self.__buildTestbedData(tbinfo)

if self.rebootType == 'service-warm-restart':
assert hasattr(self, 'service_list')
self.service_data = {}

def __extractTestParam(self):
'''
Extract test parameters from pytest request object. Note that all the parameters have default values.
Expand All @@ -106,6 +110,7 @@ def __extractTestParam(self):
self.replaceFastRebootScript = self.request.config.getoption("--replace_fast_reboot_script")
self.postRebootCheckScript = self.request.config.getoption("--post_reboot_check_script")
self.bgpV4V6TimeDiff = self.request.config.getoption("--bgp_v4_v6_time_diff")
self.new_docker_image = self.request.config.getoption("--new_docker_image")

# Set default reboot limit if it is not given
if self.rebootLimit is None:
Expand Down Expand Up @@ -326,6 +331,35 @@ def __handleRebootImage(self):
logger.info('Remove downloaded tempfile')
self.duthost.shell('rm -f {}'.format(tempfile))

def __handleDockerImage(self):
"""Download and install docker image to DUT
"""
if self.new_docker_image is None:
return

for service_name in self.service_list:
data = {}
docker_image_name = self.duthost.shell('docker ps | grep {} | awk \'{{print $2}}\''.format(service_name))['stdout']
cmd = 'docker images {} --format \{{\{{.ID\}}\}}'.format(docker_image_name)
data['image_id'] = self.duthost.shell(cmd)['stdout']
data['image_name'], data['image_tag'] = docker_image_name.split(':')

local_image_path = '/tmp/{}.gz'.format(data['image_name'])
logger.info('Downloading new docker image for {} to {}'.format(service_name, local_image_path))
output = self.localhost.shell('curl --silent --write-out "%{{http_code}}" {0}/{1}.gz --output {2}'.format(self.new_docker_image, data['image_name'], local_image_path), module_ignore_errors=True)['stdout']
if '404' not in output and os.path.exists(local_image_path):
temp_file = self.duthost.shell('mktemp')['stdout']
self.duthost.copy(src=local_image_path, dest=temp_file)
self.localhost.shell('rm -f /tmp/{}.gz'.format(data['image_name']))
data['image_path_on_dut'] = temp_file
logger.info('Successfully downloaded docker image, will perform in-service upgrade')
else:
data['image_path_on_dut'] = None
logger.info('Docker image not found, will perform in-service reboot')
self.service_data[service_name] = data

logger.info('service data = {}'.format(json.dumps(self.service_data, indent=2)))

def __setupTestbed(self):
'''
Sets testbed up. It tranfers test data files, ARP responder, and runs script to update IPs and MAC addresses.
Expand Down Expand Up @@ -438,7 +472,10 @@ def imageInstall(self, prebootList=None, inbootList=None, prebootFiles=None):
self.__setupTestbed()

# Download and install new sonic image
self.__handleRebootImage()
if self.rebootType != 'service-warm-restart':
self.__handleRebootImage()
else:
self.__handleDockerImage()

# Handle mellanox platform
self.__handleMellanoxDut()
Expand Down Expand Up @@ -586,7 +623,9 @@ def __runPtfRunner(self, rebootOper=None):
"asic_type": self.duthost.facts["asic_type"],
"allow_mac_jumping": self.allowMacJump,
"preboot_files" : self.prebootFiles,
"alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword")
"alt_password": self.duthost.host.options['variable_manager']._hostvars[self.duthost.hostname].get("ansible_altpassword"),
"service_list": None if self.rebootType != 'service-warm-restart' else self.service_list,
"service_data": None if self.rebootType != 'service-warm-restart' else self.service_data,
}

if not isinstance(rebootOper, SadOperation):
Expand Down Expand Up @@ -639,6 +678,36 @@ def __restorePrevImage(self):
wait = self.readyTimeout
)

def __restorePrevDockerImage(self):
"""Restore previous docker image.
"""
for service_name, data in self.service_data.items():
if data['image_path_on_dut'] is None:
self.duthost.shell('sudo config warm_restart disable {}'.format(service_name))
continue

# We don't use sonic-installer rollback-docker CLI here because:
# 1. it does not remove the docker image which causes the running container still using the old image
# 2. it runs service restart for some containers which enlarge the test duration
self.duthost.shell('rm -f {}'.format(data['image_path_on_dut']))
logger.info('Restore docker image for {}'.format(service_name))
self.duthost.shell('service {} stop'.format(service_name))
self.duthost.shell('docker rm {}'.format(service_name))
image_ids = self.duthost.shell('docker images {} --format \{{\{{.ID\}}\}}'.format(data['image_name']))['stdout_lines']
for image_id in image_ids:
if image_id != data['image_id']:
self.duthost.shell('docker rmi -f {}'.format(image_id))
break

self.duthost.shell('docker tag {} {}:{}'.format(data['image_id'], data['image_name'], data['image_tag']))

rebootDut(
self.duthost,
self.localhost,
reboot_type='cold',
wait = 300
)

def tearDown(self):
'''
Tears down test case. It also verifies that config_db.json exists.
Expand All @@ -658,7 +727,13 @@ def tearDown(self):
self.__runScript([self.postRebootCheckScript], self.duthost)

if not self.stayInTargetImage:
self.__restorePrevImage()
logger.info('Restoring previous image')
if self.rebootType != 'service-warm-restart':
self.__restorePrevImage()
else:
self.__restorePrevDockerImage()
else:
logger.info('Stay in new image')

@pytest.fixture
def get_advanced_reboot(request, duthosts, enum_rand_one_per_hwsku_frontend_hostname, ptfhost, localhost, tbinfo, creds):
Expand Down
16 changes: 16 additions & 0 deletions tests/platform_tests/args/advanced_reboot_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,22 @@ def add_advanced_reboot_args(parser):
help="URL of new sonic image",
)

parser.addoption(
"--new_docker_image",
action="store",
type=str,
default=None,
help="URL of new docker image",
)

parser.addoption(
"--ignore_service",
action="store",
type=str,
default=None,
help="Services that ignore for warm restart test",
)

parser.addoption(
"--ready_timeout",
action="store",
Expand Down
1 change: 0 additions & 1 deletion tests/platform_tests/test_advanced_reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
pytest.mark.topology('t0')
]


def pytest_generate_tests(metafunc):
input_sad_cases = metafunc.config.getoption("sad_case_list")
input_sad_list = list()
Expand Down
Loading

0 comments on commit 9b5a439

Please sign in to comment.