From 3eed32e3e0dc99249947dbce9679412743b4c85d Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Wed, 26 Jun 2019 14:47:37 +0800 Subject: [PATCH 1/3] [platform] Implement platform phase 2 cases Implement the SONiC platform phase 2 test cases using the pytest-ansible framework. Signed-off-by: Xin Wang --- tests/platform/check_critical_services.py | 83 ++++++++++++ tests/platform/check_interface_status.py | 50 ++++++++ tests/platform/check_transceiver_status.py | 119 +++++++++++++++++ .../mellanox/check_hw_mgmt_service.py | 44 +++++++ tests/platform/mellanox/check_sysfs.py | 73 +++++++++++ tests/platform/test_reboot.py | 121 ++++++++++++++++++ tests/platform/test_reload_config.py | 64 +++++++++ tests/platform/test_sequential_restart.py | 80 ++++++++++++ tests/platform/test_xcvr_info_in_db.py | 32 +---- tests/platform/utilities.py | 41 ++++++ 10 files changed, 679 insertions(+), 28 deletions(-) create mode 100644 tests/platform/check_critical_services.py create mode 100644 tests/platform/check_interface_status.py create mode 100644 tests/platform/check_transceiver_status.py create mode 100644 tests/platform/mellanox/check_hw_mgmt_service.py create mode 100644 tests/platform/mellanox/check_sysfs.py create mode 100644 tests/platform/test_reboot.py create mode 100644 tests/platform/test_reload_config.py create mode 100644 tests/platform/test_sequential_restart.py create mode 100644 tests/platform/utilities.py diff --git a/tests/platform/check_critical_services.py b/tests/platform/check_critical_services.py new file mode 100644 index 00000000000..d7b3ea084c0 --- /dev/null +++ b/tests/platform/check_critical_services.py @@ -0,0 +1,83 @@ +""" +Helper script for checking status of critical services + +This script contains re-usable functions for checking status of critical services. 
+""" +import time +import logging + +from utilities import wait_until + +critical_services = ["swss", "syncd", "database", "teamd", "bgp", "pmon", "lldp"] + + +def get_service_status(dut, service): + """ + @summary: Get the ActiveState and SubState of a service. This function uses the systemctl tool to get the + ActiveState and SubState of specified service. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param service: Service name. + @return: Returns a dictionary containing ActiveState and SubState of the specified service, for example: + { + "ActiveState": "active", + "SubState": "running" + } + """ + output = dut.command("systemctl -p ActiveState -p SubState show %s" % service) + result = {} + for line in output["stdout_lines"]: + fields = line.split("=") + if len(fields) >= 2: + result[fields[0]] = fields[1] + return result + + +def service_fully_started(dut, service): + """ + @summary: Check whether the specified service is fully started on DUT. According to the SONiC design, the last + instruction in service starting script is to run "docker wait ". This function takes advantage + of this design to check whether a service has been fully started. The trick is to check whether + "docker wait " exists in current running processes. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param service: Service name. + @return: Return True if the specified service is fully started. Otherwise return False. + """ + try: + output = dut.command('pgrep -f "docker wait %s"' % service) + if output["stdout_lines"]: + return True + else: + return False + except Exception: + return False + + +def critical_services_fully_started(dut): + """ + @summary: Check whether all the critical services have been fully started. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @return: Return True if all the critical services have been fully started. Otherwise return False. 
+ """ + result = {} + for service in critical_services: + result[service] = service_fully_started(dut, service) + logging.debug("Status of critical services: %s" % str(result)) + return all(result.values()) + + +def check_critical_services(dut): + """ + @summary: Use systemctl to check whether all the critical services have expected status. ActiveState of all + services must be "active". SubState of all services must be "running". + @param dut: The ansible_host object of DUT. For interacting with DUT. + """ + logging.info("Wait until all critical services are fully started") + assert wait_until(300, 20, critical_services_fully_started, dut), "Not all critical services are fully started" + + logging.info("Check critical service status") + for service in critical_services: + status = get_service_status(dut, service) + assert status["ActiveState"] == "active", \ + "ActiveState of %s is %s, expected: active" % (service, status["ActiveState"]) + assert status["SubState"] == "running", \ + "SubState of %s is %s, expected: running" % (service, status["SubState"]) diff --git a/tests/platform/check_interface_status.py b/tests/platform/check_interface_status.py new file mode 100644 index 00000000000..c3c3cea26a5 --- /dev/null +++ b/tests/platform/check_interface_status.py @@ -0,0 +1,50 @@ +""" +Helper script for checking status of interfaces + +This script contains re-usable functions for checking status of interfaces on SONiC. +""" + + +def parse_intf_status(lines): + """ + @summary: Parse the output of command "intfutil description". + @param lines: The output lines of command "intfutil description". + @return: Return a dictionary like: + { + "Ethernet0": { + "oper": "up", + "admin": "up", + "alias": "etp1", + "desc": "ARISTA01T2:Ethernet1" + }, + ... 
+ } + """ + result = {} + for line in lines: + fields = line.split() + if len(fields) >= 5: + intf = fields[0] + oper, admin, alias, desc = fields[1], fields[2], fields[3], ' '.join(fields[4:]) + result[intf] = {"oper": oper, "admin": admin, "alias": alias, "desc": desc} + return result + + +def check_interface_status(dut, interfaces): + """ + @summary: Check the admin and oper status of the specified interfaces on DUT. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param hostname: + @param interfaces: List of interfaces that need to be checked. + """ + mg_ports = dut.minigraph_facts(host=dut.hostname)["ansible_facts"]["minigraph_ports"] + output = dut.command("intfutil description") + intf_status = parse_intf_status(output["stdout_lines"][2:]) + for intf in interfaces: + expected_oper = "up" if intf in mg_ports else "down" + expected_admin = "up" if intf in mg_ports else "down" + assert intf in intf_status, "Missing status for interface %s" % intf + assert intf_status[intf]["oper"] == expected_oper, \ + "Oper status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["oper"], expected_oper) + assert intf_status[intf]["admin"] == expected_oper, \ + "Admin status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["admin"], expected_admin) diff --git a/tests/platform/check_transceiver_status.py b/tests/platform/check_transceiver_status.py new file mode 100644 index 00000000000..dec74349929 --- /dev/null +++ b/tests/platform/check_transceiver_status.py @@ -0,0 +1,119 @@ +""" +Helper script for checking status of transceivers + +This script contains re-usable functions for checking status of transceivers. 
+""" +import logging +import re +import json + + +def parse_transceiver_info(output_lines): + """ + @summary: Parse the list of transceiver from DB table TRANSCEIVER_INFO content + @param output_lines: DB table TRANSCEIVER_INFO content output by 'redis' command + @return: Return parsed transceivers in a list + """ + result = [] + p = re.compile(r"TRANSCEIVER_INFO\|(Ethernet\d+)") + for line in output_lines: + m = p.match(line) + assert m, "Unexpected line %s" % line + result.append(m.group(1)) + return result + + +def parse_transceiver_dom_sensor(output_lines): + """ + @summary: Parse the list of transceiver from DB table TRANSCEIVER_DOM_SENSOR content + @param output_lines: DB table TRANSCEIVER_DOM_SENSOR content output by 'redis' command + @return: Return parsed transceivers in a list + """ + result = [] + p = re.compile(r"TRANSCEIVER_DOM_SENSOR\|(Ethernet\d+)") + for line in output_lines: + m = p.match(line) + assert m, "Unexpected line %s" % line + result.append(m.group(1)) + return result + + +def all_transceivers_detected(dut, interfaces): + """ + Check if transceiver information of all the specified interfaces have been detected. + """ + db_output = dut.command("redis-cli --raw -n 6 keys TRANSCEIVER_INFO\*")["stdout_lines"] + not_detected_interfaces = [intf for intf in interfaces if "TRANSCEIVER_INFO|%s" % intf not in db_output] + if len(not_detected_interfaces) > 0: + logging.debug("Interfaces not detected: %s" % str(not_detected_interfaces)) + return False + return True + + +def check_transceiver_basic(dut, interfaces): + """ + @summary: Check whether all the specified interface are in TRANSCEIVER_INFO redis DB. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param interfaces: List of interfaces that need to be checked. 
+ """ + logging.info("Check whether transceiver information of all ports are in redis") + xcvr_info = dut.command("redis-cli -n 6 keys TRANSCEIVER_INFO*") + parsed_xcvr_info = parse_transceiver_info(xcvr_info["stdout_lines"]) + for intf in interfaces: + assert intf in parsed_xcvr_info, "TRANSCEIVER INFO of %s is not found in DB" % intf + + +def check_transceiver_details(dut, interfaces): + """ + @summary: Check the detailed TRANSCEIVER_INFO content of all the specified interfaces. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param interfaces: List of interfaces that need to be checked. + """ + logging.info("Check detailed transceiver information of each connected port") + expected_fields = ["type", "hardwarerev", "serialnum", "manufacturename", "modelname"] + for intf in interfaces: + port_xcvr_info = dut.command('redis-cli -n 6 hgetall "TRANSCEIVER_INFO|%s"' % intf) + for field in expected_fields: + assert port_xcvr_info["stdout"].find(field) >= 0, \ + "Expected field %s is not found in %s while checking %s" % (field, port_xcvr_info["stdout"], intf) + + +def check_transceiver_dom_sensor_basic(dut, interfaces): + """ + @summary: Check whether all the specified interface are in TRANSCEIVER_DOM_SENSOR redis DB. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param interfaces: List of interfaces that need to be checked. + """ + logging.info("Check whether TRANSCEIVER_DOM_SENSOR of all ports in redis") + xcvr_dom_sensor = dut.command("redis-cli -n 6 keys TRANSCEIVER_DOM_SENSOR*") + parsed_xcvr_dom_sensor = parse_transceiver_dom_sensor(xcvr_dom_sensor["stdout_lines"]) + for intf in interfaces: + assert intf in parsed_xcvr_dom_sensor, "TRANSCEIVER_DOM_SENSOR of %s is not found in DB" % intf + + +def check_transceiver_dom_sensor_details(dut, interfaces): + """ + @summary: Check the detailed TRANSCEIVER_DOM_SENSOR content of all the specified interfaces. + @param dut: The ansible_host object of DUT. 
For interacting with DUT. + @param interfaces: List of interfaces that need to be checked. + """ + logging.info("Check detailed TRANSCEIVER_DOM_SENSOR information of each connected ports") + expected_fields = ["temperature", "voltage", "rx1power", "rx2power", "rx3power", "rx4power", "tx1bias", + "tx2bias", "tx3bias", "tx4bias", "tx1power", "tx2power", "tx3power", "tx4power"] + for intf in interfaces: + port_xcvr_dom_sensor = dut.command('redis-cli -n 6 hgetall "TRANSCEIVER_DOM_SENSOR|%s"' % intf) + for field in expected_fields: + assert port_xcvr_dom_sensor["stdout"].find(field) >= 0, \ + "Expected field %s is not found in %s while checking %s" % (field, port_xcvr_dom_sensor["stdout"], intf) + + +def check_transceiver_status(dut, interfaces): + """ + @summary: Check transceiver information of all the specified interfaces in redis DB. + @param dut: The ansible_host object of DUT. For interacting with DUT. + @param interfaces: List of interfaces that need to be checked. + """ + check_transceiver_basic(dut, interfaces) + check_transceiver_details(dut, interfaces) + check_transceiver_dom_sensor_basic(dut, interfaces) + check_transceiver_dom_sensor_details(dut, interfaces) diff --git a/tests/platform/mellanox/check_hw_mgmt_service.py b/tests/platform/mellanox/check_hw_mgmt_service.py new file mode 100644 index 00000000000..1b012c74027 --- /dev/null +++ b/tests/platform/mellanox/check_hw_mgmt_service.py @@ -0,0 +1,44 @@ +""" +Helper function for checking the hw-management service +""" +import logging +import re + +from utilities import wait_until + + +def fan_speed_set_to_default(dut): + fan_speed_setting = dut.command("cat /var/run/hw-management/thermal/pwm1")["stdout"].strip() + return fan_speed_setting == "153" + + +def wait_until_fan_speed_set_to_default(dut): + wait_until(300, 10, fan_speed_set_to_default, dut) + + +def check_hw_management_service(dut): + """This function is to check the hw management service and related settings. 
+ """ + logging.info("Check service status using systemctl") + hw_mgmt_service_state = dut.command("systemctl -p ActiveState -p SubState show hw-management") + assert hw_mgmt_service_state["stdout"].find("ActiveState=active") >= 0, "The hw-management service is not active" + assert hw_mgmt_service_state["stdout"].find("SubState=exited") >= 0, "The hw-management service is not exited" + + logging.info("Check the thermal control process") + tc_pid = dut.command("pgrep -f /usr/bin/hw-management-thermal-control.sh") + assert re.match(r"\d+", tc_pid["stdout"]), "The hw-management-thermal-control process is not running" + + logging.info("Check thermal control status") + tc_suspend = dut.command("cat /var/run/hw-management/config/suspend") + assert tc_suspend["stdout"] == "1", "Thermal control is not suspended" + + logging.info("Check fan speed setting") + fan_speed_setting = dut.command("cat /var/run/hw-management/thermal/pwm1") + assert fan_speed_setting["stdout"] == "153", "Fan speed is not default to 60%. 153/255=60%" + + logging.info("Check dmesg") + dmesg = dut.command("sudo dmesg") + error_keywords = ["crash", "Out of memory", "Call Trace", "Exception", "panic"] + for err_kw in error_keywords: + assert not re.search(err_kw, dmesg["stdout"], re.I), \ + "Found error keyword %s in dmesg: %s" % (err_kw, dmesg["stdout"]) diff --git a/tests/platform/mellanox/check_sysfs.py b/tests/platform/mellanox/check_sysfs.py new file mode 100644 index 00000000000..199446832a5 --- /dev/null +++ b/tests/platform/mellanox/check_sysfs.py @@ -0,0 +1,73 @@ +""" +Helper script for checking status of sysfs. + +This script contains re-usable functions for checking status of hw-management related sysfs. 
+""" +import logging + + +def check_sysfs(dut): + """ + @summary: Check various hw-management related sysfs under /var/run/hw-management + """ + logging.info("Check broken symbolinks") + broken_symbolinks = dut.command("find /var/run/hw-management -xtype l") + assert len(broken_symbolinks["stdout_lines"]) == 0, \ + "Found some broken symbolinks: %s" % str(broken_symbolinks["stdout_lines"]) + + logging.info("Check content of some key files") + + file_suspend = dut.command("cat /var/run/hw-management/config/suspend") + assert file_suspend["stdout"] == "1", "Content of /var/run/hw-management/config/suspend should be 1" + + file_pwm1 = dut.command("cat /var/run/hw-management/thermal/pwm1") + assert file_pwm1["stdout"] == "153", "Content of /var/run/hw-management/thermal/pwm1 should be 153" + + file_asic = dut.command("cat /var/run/hw-management/thermal/asic") + try: + asic_temp = float(file_asic["stdout"]) / 1000 + assert asic_temp > 0 and asic_temp < 85, "Abnormal ASIC temperature: %s" % file_asic["stdout"] + except ValueError: + assert False, "Bad content in /var/run/hw-management/thermal/asic: %s" % file_asic["stdout"] + + fan_status_list = dut.command("find /var/run/hw-management/thermal -name fan*_status") + for fan_status in fan_status_list["stdout_lines"]: + fan_status_content = dut.command("cat %s" % fan_status) + assert fan_status_content["stdout"] == "1", "Content of %s is not 1" % fan_status + + fan_fault_list = dut.command("find /var/run/hw-management/thermal -name fan*_fault") + for fan_fault in fan_fault_list["stdout_lines"]: + fan_fault_content = dut.command("cat %s" % fan_fault) + assert fan_fault_content["stdout"] == "0", "Content of %s is not 0" % fan_fault + + fan_min_list = dut.command("find /var/run/hw-management/thermal -name fan*_min") + for fan_min in fan_min_list["stdout_lines"]: + try: + fan_min_content = dut.command("cat %s" % fan_min) + fan_min_speed = int(fan_min_content["stdout"]) + assert fan_min_speed > 0, "Bad fan minimum speed: %s" % str(fan_min_speed) 
+ except Exception as e: + assert False, "Get content from %s failed, exception: %s" % (fan_min, repr(e)) + + fan_max_list = dut.command("find /var/run/hw-management/thermal -name fan*_max") + for fan_max in fan_max_list["stdout_lines"]: + try: + fan_max_content = dut.command("cat %s" % fan_max) + fan_max_speed = int(fan_max_content["stdout"]) + assert fan_max_speed > 10000, "Bad fan maximum speed: %s" % str(fan_max_speed) + except Exception as e: + assert False, "Get content from %s failed, exception: %s" % (fan_max, repr(e)) + + fan_speed_get_list = dut.command("find /var/run/hw-management/thermal -name fan*_speed_get") + for fan_speed_get in fan_speed_get_list["stdout_lines"]: + try: + fan_speed_get_content = dut.command("cat %s" % fan_speed_get) + fan_speed = int(fan_speed_get_content["stdout"]) + assert fan_speed > 1000, "Bad fan speed: %s" % str(fan_speed) + except Exception as e: + assert False, "Get content from %s failed, exception: %s" % (fan_speed_get, repr(e)) + + fan_speed_set_list = dut.command("find /var/run/hw-management/thermal -name fan*_speed_set") + for fan_speed_set in fan_speed_set_list["stdout_lines"]: + fan_speed_set_content = dut.command("cat %s" % fan_speed_set) + assert fan_speed_set_content["stdout"] == "153", "Fan speed should be set to 60%, 153/255" diff --git a/tests/platform/test_reboot.py b/tests/platform/test_reboot.py new file mode 100644 index 00000000000..5f4128e7284 --- /dev/null +++ b/tests/platform/test_reboot.py @@ -0,0 +1,121 @@ +""" +Check platform status after reboot. 
Three types of reboot are covered in this script: +* Cold reboot +* Fast reboot +* Warm reboot + +This script is to cover the test case 'Reload configuration' in the SONiC platform test plan: +https://github.com/Azure/SONiC/blob/master/doc/pmon/sonic_platform_test_plan.md +""" +import logging +import re +import os +import time +import sys + +import pytest + +from ansible_host import ansible_host +from utilities import wait_until +from check_critical_services import check_critical_services +from check_interface_status import check_interface_status +from check_transceiver_status import check_transceiver_basic +from check_transceiver_status import all_transceivers_detected + + +def reboot_and_check(localhost, dut, reboot_type="cold"): + """ + Perform the specified type of reboot and check platform status. + """ + dut.command("show platform summary") + lab_conn_graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), \ + "../../ansible/files/lab_connection_graph.xml") + conn_graph_facts = localhost.conn_graph_facts(host=dut.hostname, filename=lab_conn_graph_file).\ + contacted['localhost']['ansible_facts'] + interfaces = conn_graph_facts["device_conn"] + asic_type = dut.shell("show platform summary | awk '/ASIC: / {print$2}'")["stdout"].strip() + + logging.info("Run %s reboot on DUT" % reboot_type) + if reboot_type == "cold": + reboot_cmd = "sudo reboot" + elif reboot_type == "fast": + reboot_cmd = "sudo fast-reboot &" + elif reboot_type == "warm": + reboot_cmd = "sudo warm-reboot &" + else: + assert False, "Reboot type %s is not supported" % reboot_type + dut.shell(reboot_cmd) + + logging.info("Wait for DUT to go down") + localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=300) + + logging.info("Wait for DUT to come back") + localhost.wait_for(host=dut.hostname, port=22, state="started", delay=10, timeout=300) + + logging.info("Wait until all critical services are fully started") + check_critical_services(dut) + + 
logging.info("Wait some time for all the transceivers to be detected") + assert wait_until(300, 20, all_transceivers_detected, dut, interfaces), \ + "Not all transceivers are detected in 300 seconds" + + logging.info("Check interface status") + check_interface_status(dut, interfaces) + + logging.info("Check transceiver status") + check_transceiver_basic(dut, interfaces) + + if asic_type in ["mellanox"]: + + current_file_dir = os.path.dirname(os.path.realpath(__file__)) + sub_folder_dir = os.path.join(current_file_dir, "mellanox") + if sub_folder_dir not in sys.path: + sys.path.append(sub_folder_dir) + from check_hw_mgmt_service import check_hw_management_service + from check_hw_mgmt_service import wait_until_fan_speed_set_to_default + from check_sysfs import check_sysfs + + logging.info("Wait until fan speed is set to default") + wait_until_fan_speed_set_to_default(dut) + + logging.info("Check the hw-management service") + check_hw_management_service(dut) + + logging.info("Check sysfs") + check_sysfs(dut) + + +def test_cold_reboot(localhost, ansible_adhoc, testbed): + """ + @summary: This test case is to perform cold reboot and check platform status + """ + hostname = testbed['dut'] + ans_host = ansible_host(ansible_adhoc, hostname) + + reboot_and_check(localhost, ans_host, reboot_type="cold") + + +def test_fast_reboot(localhost, ansible_adhoc, testbed): + """ + @summary: This test case is to perform fast reboot and check platform status + """ + hostname = testbed['dut'] + ans_host = ansible_host(ansible_adhoc, hostname) + + reboot_and_check(localhost, ans_host, reboot_type="fast") + + +def test_warm_reboot(localhost, ansible_adhoc, testbed): + """ + @summary: This test case is to perform warm reboot and check platform status + """ + hostname = testbed['dut'] + ans_host = ansible_host(ansible_adhoc, hostname) + asic_type = ans_host.shell("show platform summary | awk '/ASIC: / {print$2}'")["stdout"].strip() + + if asic_type in ["mellanox"]: + issu_capability = 
ans_host.command("show platform mlnx issu")["stdout"] + if "disabled" in issu_capability: + pytest.skip("ISSU is not supported on this DUT, skip this test case") + + reboot_and_check(localhost, ans_host, reboot_type="warm") diff --git a/tests/platform/test_reload_config.py b/tests/platform/test_reload_config.py new file mode 100644 index 00000000000..e7a7d947612 --- /dev/null +++ b/tests/platform/test_reload_config.py @@ -0,0 +1,64 @@ +""" +Check platform status after config is reloaded + +This script is to cover the test case 'Reload configuration' in the SONiC platform test plan: +https://github.com/Azure/SONiC/blob/master/doc/pmon/sonic_platform_test_plan.md +""" +import logging +import re +import os +import time +import sys + +from ansible_host import ansible_host +from utilities import wait_until +from check_critical_services import check_critical_services +from check_interface_status import check_interface_status +from check_transceiver_status import check_transceiver_basic +from check_transceiver_status import all_transceivers_detected + + +def test_reload_configuration(localhost, ansible_adhoc, testbed): + """ + @summary: This test case is to reload the configuration and check platform status + """ + hostname = testbed['dut'] + ans_host = ansible_host(ansible_adhoc, hostname) + ans_host.command("show platform summary") + lab_conn_graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), \ + "../../ansible/files/lab_connection_graph.xml") + conn_graph_facts = localhost.conn_graph_facts(host=hostname, filename=lab_conn_graph_file).\ + contacted['localhost']['ansible_facts'] + interfaces = conn_graph_facts["device_conn"] + asic_type = ans_host.shell("show platform summary | awk '/ASIC: / {print$2}'")["stdout"].strip() + + logging.info("Reload configuration") + ans_host.command("sudo config reload -y") + + logging.info("Wait until all critical services are fully started") + check_critical_services(ans_host) + + logging.info("Wait some time for all 
the transceivers to be detected") + assert wait_until(300, 20, all_transceivers_detected, ans_host, interfaces), \ + "Not all transceivers are detected in 300 seconds" + + logging.info("Check interface status") + check_interface_status(ans_host, interfaces) + + logging.info("Check transceiver status") + check_transceiver_basic(ans_host, interfaces) + + if asic_type in ["mellanox"]: + + current_file_dir = os.path.dirname(os.path.realpath(__file__)) + sub_folder_dir = os.path.join(current_file_dir, "mellanox") + if sub_folder_dir not in sys.path: + sys.path.append(sub_folder_dir) + from check_hw_mgmt_service import check_hw_management_service + from check_sysfs import check_sysfs + + logging.info("Check the hw-management service") + check_hw_management_service(ans_host) + + logging.info("Check sysfs") + check_sysfs(ans_host) diff --git a/tests/platform/test_sequential_restart.py b/tests/platform/test_sequential_restart.py new file mode 100644 index 00000000000..729cec750ab --- /dev/null +++ b/tests/platform/test_sequential_restart.py @@ -0,0 +1,80 @@ +""" +Check platform status after service is restarted + +This script is to cover the test case 'Sequential syncd/swss restart' in the SONiC platform test plan: +https://github.com/Azure/SONiC/blob/master/doc/pmon/sonic_platform_test_plan.md +""" +import logging +import re +import os +import time +import sys + +from ansible_host import ansible_host +from utilities import wait_until +from check_critical_services import check_critical_services +from check_interface_status import check_interface_status +from check_transceiver_status import check_transceiver_basic +from check_transceiver_status import all_transceivers_detected + + +def restart_service_and_check(localhost, dut, service): + """ + Restart specified service and check platform status + """ + dut.command("show platform summary") + lab_conn_graph_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), \ + "../../ansible/files/lab_connection_graph.xml") + 
conn_graph_facts = localhost.conn_graph_facts(host=dut.hostname, filename=lab_conn_graph_file).\ + contacted['localhost']['ansible_facts'] + interfaces = conn_graph_facts["device_conn"] + asic_type = dut.shell("show platform summary | awk '/ASIC: / {print$2}'")["stdout"].strip() + + logging.info("Restart the %s service" % service) + dut.command("sudo systemctl restart %s" % service) + + logging.info("Wait until all critical services are fully started") + check_critical_services(dut) + + logging.info("Wait some time for all the transceivers to be detected") + assert wait_until(300, 20, all_transceivers_detected, dut, interfaces), \ + "Not all transceivers are detected in 300 seconds" + + logging.info("Check interface status") + check_interface_status(dut, interfaces) + + logging.info("Check transceiver status") + check_transceiver_basic(dut, interfaces) + + if asic_type in ["mellanox"]: + + current_file_dir = os.path.dirname(os.path.realpath(__file__)) + sub_folder_dir = os.path.join(current_file_dir, "mellanox") + if sub_folder_dir not in sys.path: + sys.path.append(sub_folder_dir) + from check_hw_mgmt_service import check_hw_management_service + from check_sysfs import check_sysfs + + logging.info("Check the hw-management service") + check_hw_management_service(dut) + + logging.info("Check sysfs") + check_sysfs(dut) + + +def test_restart_swss(localhost, ansible_adhoc, testbed): + """ + @summary: This test case is to restart the swss service and check platform status + """ + hostname = testbed['dut'] + ans_host = ansible_host(ansible_adhoc, hostname) + restart_service_and_check(localhost, ans_host, "swss") + + +def test_restart_syncd(localhost, ansible_adhoc, testbed): + """ + @summary: This test case is to restart the syncd service and check platform status + """ + hostname = testbed['dut'] + ans_host = ansible_host(ansible_adhoc, hostname) + restart_service_and_check(localhost, ans_host, "syncd") diff --git a/tests/platform/test_xcvr_info_in_db.py 
b/tests/platform/test_xcvr_info_in_db.py index 1d50f5c4333..908b5684409 100644 --- a/tests/platform/test_xcvr_info_in_db.py +++ b/tests/platform/test_xcvr_info_in_db.py @@ -9,6 +9,7 @@ import os from ansible_host import ansible_host +from check_transceiver_status import check_transceiver_status def parse_transceiver_info(output_lines): @@ -52,32 +53,7 @@ def test_xcvr_info_in_db(localhost, ansible_adhoc, testbed): "../../ansible/files/lab_connection_graph.xml") conn_graph_facts = localhost.conn_graph_facts(host=hostname, filename=lab_conn_graph_file).\ contacted['localhost']['ansible_facts'] + interfaces = conn_graph_facts["device_conn"] - logging.info("Check whether transceiver information of all ports are in redis") - xcvr_info = ans_host.command("redis-cli -n 6 keys TRANSCEIVER_INFO*") - parsed_xcvr_info = parse_transceiver_info(xcvr_info["stdout_lines"]) - for intf in conn_graph_facts["device_conn"]: - assert intf in parsed_xcvr_info, "TRANSCEIVER INFO of %s is not found in DB" % intf - - logging.info("Check detailed transceiver information of each connected port") - expected_fields = ["type", "hardwarerev", "serialnum", "manufacturename", "modelname"] - for intf in conn_graph_facts["device_conn"]: - port_xcvr_info = ans_host.command('redis-cli -n 6 hgetall "TRANSCEIVER_INFO|%s"' % intf) - for field in expected_fields: - assert port_xcvr_info["stdout"].find(field) >= 0, \ - "Expected field %s is not found in %s while checking %s" % (field, port_xcvr_info["stdout"], intf) - - logging.info("Check whether TRANSCEIVER_DOM_SENSOR of all ports in redis") - xcvr_dom_senspor = ans_host.command("redis-cli -n 6 keys TRANSCEIVER_DOM_SENSOR*") - parsed_xcvr_dom_senspor = parse_transceiver_dom_sensor(xcvr_dom_senspor["stdout_lines"]) - for intf in conn_graph_facts["device_conn"]: - assert intf in parsed_xcvr_dom_senspor, "TRANSCEIVER_DOM_SENSOR of %s is not found in DB" % intf - - logging.info("Check detailed TRANSCEIVER_DOM_SENSOR information of each connected ports") - 
expected_fields = ["temperature", "voltage", "rx1power", "rx2power", "rx3power", "rx4power", "tx1bias", - "tx2bias", "tx3bias", "tx4bias", "tx1power", "tx2power", "tx3power", "tx4power"] - for intf in conn_graph_facts["device_conn"]: - port_xcvr_dom_sensor = ans_host.command('redis-cli -n 6 hgetall "TRANSCEIVER_DOM_SENSOR|%s"' % intf) - for field in expected_fields: - assert port_xcvr_dom_sensor["stdout"].find(field) >= 0, \ - "Expected field %s is not found in %s while checking %s" % (field, port_xcvr_dom_sensor["stdout"], intf) + logging.info("Check transceiver status") + check_transceiver_status(ans_host, interfaces) diff --git a/tests/platform/utilities.py b/tests/platform/utilities.py new file mode 100644 index 00000000000..6ec3a349d7c --- /dev/null +++ b/tests/platform/utilities.py @@ -0,0 +1,41 @@ +""" +Utility functions can re-used in testing scripts. +""" +import time +import logging + +def wait_until(timeout, interval, condition, *args, **kwargs): + """ + @summary: Wait until the specified condition is True or timeout. + @param timeout: Maximum time to wait + @param interval: Poll interval + @param condition: A function that returns False or True + @param *args: Extra args required by the 'condition' function. + @param **kwargs: Extra args required by the 'condition' function. + @return: If the condition function returns True before timeout, return True. If the condition function raises an + exception, log the error and keep waiting and polling. 
+ """ + logging.debug("Wait until %s is True, timeout is %s seconds, checking interval is %s" % \ + (condition.__name__, timeout, interval)) + start_time = time.time() + elapsed_time = 0 + while elapsed_time < timeout: + logging.debug("Time elapsed: %f seconds" % elapsed_time) + + try: + check_result = condition(*args, **kwargs) + except Exception as e: + logging.debug("Exception caught while checking %s: %s" % (condition.__name__, repr(e))) + check_result = False + + if check_result: + logging.debug("%s is True, exit early with True" % condition.__name__) + return True + else: + logging.debug("%s is False, wait %d seconds and check again" % (condition.__name__, interval)) + time.sleep(interval) + elapsed_time = time.time() - start_time + + if elapsed_time >= timeout: + logging.debug("%s is still False after %d seconds, exit with False" % (condition.__name__, timeout)) + return False From 9d982f779e57ecfbfa42e533c93f1b0c3355445d Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Tue, 2 Jul 2019 09:57:39 +0800 Subject: [PATCH 2/3] [platform] Add interface status checking using the interface_facts module --- tests/platform/check_interface_status.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/platform/check_interface_status.py b/tests/platform/check_interface_status.py index c3c3cea26a5..72d863007b5 100644 --- a/tests/platform/check_interface_status.py +++ b/tests/platform/check_interface_status.py @@ -3,6 +3,7 @@ This script contains re-usable functions for checking status of interfaces on SONiC. """ +import logging def parse_intf_status(lines): @@ -37,6 +38,7 @@ def check_interface_status(dut, interfaces): @param hostname: @param interfaces: List of interfaces that need to be checked. 
""" + logging.info("Check interface status using cmd 'intfutil'") mg_ports = dut.minigraph_facts(host=dut.hostname)["ansible_facts"]["minigraph_ports"] output = dut.command("intfutil description") intf_status = parse_intf_status(output["stdout_lines"][2:]) @@ -48,3 +50,8 @@ def check_interface_status(dut, interfaces): "Oper status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["oper"], expected_oper) assert intf_status[intf]["admin"] == expected_oper, \ "Admin status of interface %s is %s, expected '%s'" % (intf, intf_status[intf]["admin"], expected_admin) + + logging.info("Check interface status using the interface_facts module") + intf_facts = dut.interface_facts(up_ports=mg_ports)["ansible_facts"] + down_ports = intf_facts["ansible_interface_link_down_ports"] + assert len(down_ports) == 0, "Some interfaces are down: %s" % str(down_ports) From a93dc8f5a9418e2090a055be3531f432bb06b801 Mon Sep 17 00:00:00 2001 From: Xin Wang Date: Thu, 18 Jul 2019 10:29:55 +0800 Subject: [PATCH 3/3] [platform] Fix some minor issues * Run reboot command in background to avoid command failure caused by SSH connection broken before command returns * Fine tune the reboot wait timeout values * Add delay before checking interface status because the intfutil command may have no output in time Signed-off-by: Xin Wang --- tests/platform/test_reboot.py | 9 ++++++--- tests/platform/test_reload_config.py | 1 + tests/platform/test_sequential_restart.py | 1 + 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/platform/test_reboot.py b/tests/platform/test_reboot.py index 5f4128e7284..38444223a95 100644 --- a/tests/platform/test_reboot.py +++ b/tests/platform/test_reboot.py @@ -37,20 +37,23 @@ def reboot_and_check(localhost, dut, reboot_type="cold"): logging.info("Run %s reboot on DUT" % reboot_type) if reboot_type == "cold": - reboot_cmd = "sudo reboot" + reboot_cmd = "sudo reboot &" + reboot_timeout = 300 elif reboot_type == "fast": reboot_cmd = "sudo 
fast-reboot &" + reboot_timeout = 180 elif reboot_type == "warm": reboot_cmd = "sudo warm-reboot &" + reboot_timeout = 180 else: assert False, "Reboot type %s is not supported" % reboot_type dut.shell(reboot_cmd) logging.info("Wait for DUT to go down") - localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=300) + localhost.wait_for(host=dut.hostname, port=22, state="stopped", delay=10, timeout=120) logging.info("Wait for DUT to come back") - localhost.wait_for(host=dut.hostname, port=22, state="started", delay=10, timeout=300) + localhost.wait_for(host=dut.hostname, port=22, state="started", delay=10, timeout=reboot_timeout) logging.info("Wait until all critical services are fully started") check_critical_services(dut) diff --git a/tests/platform/test_reload_config.py b/tests/platform/test_reload_config.py index e7a7d947612..83fe3544189 100644 --- a/tests/platform/test_reload_config.py +++ b/tests/platform/test_reload_config.py @@ -43,6 +43,7 @@ def test_reload_configuration(localhost, ansible_adhoc, testbed): "Not all transceivers are detected in 300 seconds" logging.info("Check interface status") + time.sleep(60) check_interface_status(ans_host, interfaces) logging.info("Check transceiver status") diff --git a/tests/platform/test_sequential_restart.py b/tests/platform/test_sequential_restart.py index 729cec750ab..092bc614a62 100644 --- a/tests/platform/test_sequential_restart.py +++ b/tests/platform/test_sequential_restart.py @@ -41,6 +41,7 @@ def restart_service_and_check(localhost, dut, service): "Not all transceivers are detected in 300 seconds" logging.info("Check interface status") + time.sleep(60) check_interface_status(dut, interfaces) logging.info("Check transceiver status")