From d5db9ec1591d63add296af72ddd17dc984ea0116 Mon Sep 17 00:00:00 2001 From: Chuan Wu Date: Mon, 21 Nov 2022 10:08:08 +0800 Subject: [PATCH] Add reboot cause script Add new reboot cause script for bios cpu and asic causes --- tests/common/reboot.py | 55 +++++++++++---- .../mellanox_thermal_control_test_helper.py | 70 +++++++++++++------ .../mellanox/test_reboot_cause.py | 42 +++++++++++ 3 files changed, 131 insertions(+), 36 deletions(-) create mode 100644 tests/platform_tests/mellanox/test_reboot_cause.py diff --git a/tests/common/reboot.py b/tests/common/reboot.py index ad9f3dac8f..5815399901 100644 --- a/tests/common/reboot.py +++ b/tests/common/reboot.py @@ -22,6 +22,9 @@ REBOOT_TYPE_WATCHDOG = "watchdog" REBOOT_TYPE_UNKNOWN = "Unknown" REBOOT_TYPE_THERMAL_OVERLOAD = "Thermal Overload" +REBOOT_TYPE_CPU = "cpu" +REBOOT_TYPE_BIOS = "bios" +REBOOT_TYPE_ASIC = "asic" # Event to signal DUT activeness DUT_ACTIVE = threading.Event() @@ -87,6 +90,24 @@ "cause": "warm-reboot", "test_reboot_cause_only": False }, + REBOOT_TYPE_CPU: { + "timeout": 300, + "wait": 120, + "cause": "CPU", + "test_reboot_cause_only": True + }, + REBOOT_TYPE_BIOS: { + "timeout": 300, + "wait": 120, + "cause": "BIOS", + "test_reboot_cause_only": True + }, + REBOOT_TYPE_ASIC: { + "timeout": 300, + "wait": 120, + "cause": "ASIC", + "test_reboot_cause_only": True + } } MAX_NUM_REBOOT_CAUSE_HISTORY = 10 @@ -187,7 +208,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10, pool = ThreadPool() hostname = duthost.hostname try: - reboot_ctrl = reboot_ctrl_dict[reboot_type] + reboot_ctrl = reboot_ctrl_dict[reboot_type] reboot_command = reboot_ctrl['command'] if reboot_type != REBOOT_TYPE_POWEROFF else None if timeout == 0: timeout = reboot_ctrl['timeout'] @@ -199,7 +220,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10, raise ValueError('invalid reboot type: "{} for {}"'.format(reboot_type, hostname)) reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type) - + wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res) # if wait_for_ssh flag is False, do not wait for dut to boot up if not wait_for_ssh: @@ -222,7 +243,8 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10, pool.terminate() dut_uptime = duthost.get_up_time() logger.info('DUT {} up since {}'.format(hostname, dut_uptime)) - assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot".format(hostname) + assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot". \ + format(hostname) def get_reboot_cause(dut): @@ -232,7 +254,7 @@ def get_reboot_cause(dut): """ logging.info('Getting reboot cause from dut {}'.format(dut.hostname)) output = dut.shell('show reboot-cause') - cause = output['stdout'] + cause = output['stdout'] for type, ctrl in reboot_ctrl_dict.items(): if re.search(ctrl['cause'], cause): @@ -282,13 +304,13 @@ def sync_reboot_history_queue_with_dut(dut): dut_reboot_history_queue = dut.show_and_parse("show reboot-cause history") dut_reboot_history_received = True break - except Exception as e: + except Exception: e_type, e_value, e_traceback = sys.exc_info() logging.info("Exception type: %s" % e_type.__name__) logging.info("Exception message: %s" % e_value) - logging.info("Backing off for %d seconds before retrying", ((retry_count+1) * RETRY_BACKOFF_TIME)) + logging.info("Backing off for %d seconds before retrying", ((retry_count + 1) * RETRY_BACKOFF_TIME)) - time.sleep(((retry_count+1) * RETRY_BACKOFF_TIME)) + time.sleep(((retry_count + 1) * RETRY_BACKOFF_TIME)) continue # If retry logic did not yield reboot cause history from DUT, @@ -349,21 +371,26 @@ def check_reboot_cause_history(dut, reboot_type_history_queue): logging.info("Verify reboot-cause history title") if reboot_cause_history_got: if not set(REBOOT_CAUSE_HISTORY_TITLE) == set(reboot_cause_history_got[0].keys()): - logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".format( - REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys())) + logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}". + format(REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys())) return False - logging.info("Verify reboot-cause output are sorted in reverse chronological order" ) + logging.info("Verify reboot-cause output are sorted in reverse chronological order") reboot_type_history_len = len(reboot_type_history_queue) if reboot_type_history_len <= len(reboot_cause_history_got): for index, reboot_type in enumerate(reboot_type_history_queue): if reboot_type not in reboot_ctrl_dict: - logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".format(reboot_type)) + logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.". + format(reboot_type)) continue - logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" % (index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"])) - if not re.search(reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]): + logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" % + (index, reboot_ctrl_dict[reboot_type]["cause"], + reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"])) + if not re.search(reboot_ctrl_dict[reboot_type]["cause"], + reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]): logging.error("The {} reboot-cause not match. expected_reboot type={}, actual_reboot_cause={}".format( - index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index]["cause"])) + index, reboot_ctrl_dict[reboot_type]["cause"], + reboot_cause_history_got[reboot_type_history_len - index]["cause"])) return False return True logging.error("The number of expected reboot-cause:{} is more than that of actual reboot-cuase:{}".format( diff --git a/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py b/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py index ed2d79abc3..08a98bc20a 100644 --- a/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py +++ b/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py @@ -4,7 +4,8 @@ import logging import time from pkg_resources import parse_version -from tests.platform_tests.thermal_control_test_helper import * +from tests.platform_tests.thermal_control_test_helper import mocker, FanStatusMocker, ThermalStatusMocker, \ + SingleFanMocker from tests.common.mellanox_data import get_platform_data from minimum_table import get_min_table @@ -96,6 +97,7 @@ } } + class SysfsNotExistError(Exception): """ Exception when sys fs not exist. @@ -138,7 +140,6 @@ def __init__(self, dut): :param dut: DUT object representing a SONiC switch under test. """ self.dut = dut - #self.unlink_file_list = {} self._extract_num_of_fans_and_fan_drawers() self.deinit_retry = 5 @@ -289,7 +290,7 @@ def deinit(self): for file_path, link_target in self.unlink_file_list.items(): try: self.dut.command('ln -f -s {} {}'.format(link_target, file_path)) - except Exception as e: + except Exception: # Catch any exception for later retry failed_recover_links[file_path] = link_target @@ -300,7 +301,7 @@ def deinit(self): self.dut.shell('rm -f {}'.format(file_path)) else: self.dut.shell('echo \'{}\' > {}'.format(value, file_path)) - except Exception as e: + except Exception: # Catch any exception for later retry failed_recover_files[file_path] = value @@ -417,7 +418,7 @@ def mock_fan_direction_fan_dir_per_fan(self, direction): """ try: _ = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_PER_FAN.format(self.index))) - except SysfsNotExistError as e: + except SysfsNotExistError: self.mocked_direction = NOT_AVAILABLE return @@ -438,7 +439,7 @@ def mock_fan_direction_fan_dir_for_all_fans(self, direction): """ try: fan_dir_bits = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_ALL_FANS)) - except SysfsNotExistError as e: + except SysfsNotExistError: self.mocked_direction = NOT_AVAILABLE return @@ -479,6 +480,7 @@ def get_expect_led_color(self): return 'green' + class FanData: """ Data mocker of a FAN. @@ -713,7 +715,7 @@ def check_result(self, actual_data): mismatch_in_actual_data = [] for actual_data_item in actual_data: primary = actual_data_item[self.primary_field] - if not primary in expected: + if primary not in expected: extra_in_actual_data.append(actual_data_item) else: for field in actual_data_item.keys(): @@ -726,16 +728,16 @@ def check_result(self, actual_data): result = True if len(extra_in_actual_data) > 0: - logging.error('Found extra data in actual_data: {}'\ - .format(json.dumps(extra_in_actual_data, indent=2))) + logging.error('Found extra data in actual_data: {}' + .format(json.dumps(extra_in_actual_data, indent=2))) result = False if len(mismatch_in_actual_data) > 0: - logging.error('Found mismatch data in actual_data: {}'\ - .format(json.dumps(mismatch_in_actual_data, indent=2))) + logging.error('Found mismatch data in actual_data: {}' + .format(json.dumps(mismatch_in_actual_data, indent=2))) result = False if len(expected.keys()) > 0: - logging.error('Expected data not found in actual_data: {}'\ - .format(json.dumps(expected, indent=2))) + logging.error('Expected data not found in actual_data: {}' + .format(json.dumps(expected, indent=2))) result = False return result @@ -761,7 +763,7 @@ def __init__(self, dut): self.expected_data = {} self.expected_data_headers = ['drawer', 'led', 'fan', 'speed', 'direction', 'presence', 'status'] self.primary_field = 'fan' - self.excluded_fields = ['timestamp',] + self.excluded_fields = ['timestamp', ] def deinit(self): """ @@ -779,12 +781,11 @@ def mock_data(self): drawer_index = 1 drawer_data = None presence = 0 - direction = NOT_AVAILABLE naming_rule = FAN_NAMING_RULE['fan'] # All system fan is controlled to have the same speed, so only # get a random value once here speed = random.randint(60, 100) - FanData.mock_cooling_cur_state(self.mock_helper, speed/10) + FanData.mock_cooling_cur_state(self.mock_helper, speed / 10) while fan_index <= MockerHelper.FAN_NUM: try: if (fan_index - 1) % MockerHelper.FAN_NUM_PER_DRAWER == 0: @@ -806,7 +807,7 @@ def mock_data(self): fan_data.mock_target_speed(speed) self.expected_data[fan_data.name] = [ drawer_data.name, - 'N/A', # update this value later + 'N/A', # update this value later fan_data.name, '{}%'.format(fan_data.mocked_speed), drawer_data.mocked_direction, @@ -894,9 +895,10 @@ def __init__(self, dut): ThermalStatusMocker.__init__(self, dut) self.mock_helper = MockerHelper(dut) self.expected_data = {} - self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th', 'warning'] + self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th', + 'warning'] self.primary_field = 'sensor' - self.excluded_fields = ['timestamp',] + self.excluded_fields = ['timestamp', ] def deinit(self): """ @@ -1097,7 +1099,8 @@ def mock_over_speed(self): Change the mocked FAN speed to faster than target speed and exceed speed tolerance. :return: """ - self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10) + self.fan_data.mock_speed( + AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10) self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE) self.expect_led_color = 'red' @@ -1106,7 +1109,8 @@ def mock_under_speed(self): Change the mocked FAN speed to slower than target speed and exceed speed tolerance. :return: """ - self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10) + self.fan_data.mock_speed( + AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10) self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE) self.expect_led_color = 'red' @@ -1237,7 +1241,7 @@ def mock_power_threshold(self, number_psus): if not max_power: power = int(self.mock_helper.read_value(self.PSU_POWER.format(i + 1))) # Round up to 100 watt and then double it to avoid noise when power fluctuate - max_power = int(round(power/100000000.0)) * 100000000 * 2 + max_power = int(round(power / 100000000.0)) * 100000000 * 2 self.mock_helper.mock_value(self.PSU_POWER_CAPACITY.format(i + 1), max_power, True) # Also mock ambient temperatures @@ -1273,3 +1277,25 @@ def read_port_ambient_thermal(self): def read_fan_ambient_thermal(self): return int(self.mock_helper.read_value(self.FAN_AMBIENT_TEMP)) + + +@mocker('RebootCauseMocker') +class RebootCauseMocker(object): + RESET_RELOAD_BIOS = '/var/run/hw-management/system/reset_reload_bios' + RESET_FROM_COMEX = '/var/run/hw-management/system/reset_from_comex' + RESET_FROM_ASIC = '/var/run/hw-management/system/reset_from_asic' + + def __init__(self, dut): + self.mock_helper = MockerHelper(dut) + + def deinit(self): + self.mock_helper.deinit() + + def mock_reset_reload_bios(self): + self.mock_helper.mock_value(self.RESET_RELOAD_BIOS, 1) + + def mock_reset_from_comex(self): + self.mock_helper.mock_value(self.RESET_FROM_COMEX, 1) + + def mock_reset_from_asic(self): + self.mock_helper.mock_value(self.RESET_FROM_ASIC, 1) diff --git a/tests/platform_tests/mellanox/test_reboot_cause.py b/tests/platform_tests/mellanox/test_reboot_cause.py new file mode 100644 index 0000000000..611ca6ba2e --- /dev/null +++ b/tests/platform_tests/mellanox/test_reboot_cause.py @@ -0,0 +1,42 @@ +import allure +import logging +import pytest +from tests.common.reboot import REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC, check_reboot_cause +from tests.platform_tests.thermal_control_test_helper import mocker_factory # noqa: F401 + +pytestmark = [ + pytest.mark.asic('mellanox'), + pytest.mark.topology('any') +] + +logger = logging.getLogger(__name__) + +mocker = None +REBOOT_CAUSE_TYPES = [REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC] + + +@pytest.mark.parametrize("reboot_cause", REBOOT_CAUSE_TYPES) +def test_reboot_cause(rand_selected_dut, mocker_factory, reboot_cause): # noqa: F811 + """ + Validate reboot cause from cpu/bios/asic + :param rand_selected_dut: The fixture returns a randomly selected DUT + :param mocker_factory: The fixture returns a mocker + :param reboot_cause: The specific reboot cause + """ + duthost = rand_selected_dut + with allure.step('Create mocker - RebootCauseMocker'): + mocker = mocker_factory(duthost, 'RebootCauseMocker') + + with allure.step('Mock reset from {}'.format(reboot_cause)): + if reboot_cause == REBOOT_TYPE_CPU: + mocker.mock_reset_from_comex() + elif reboot_cause == REBOOT_TYPE_BIOS: + mocker.mock_reset_reload_bios() + elif reboot_cause == REBOOT_TYPE_ASIC: + mocker.mock_reset_from_asic() + + with allure.step('Restart determine-reboot-cause service'): + duthost.restart_service('determine-reboot-cause') + + with allure.step('Check Reboot cause is {}'.format(reboot_cause)): + check_reboot_cause(duthost, reboot_cause)