Skip to content

Commit

Permalink
Enhance Mellanox reboot cause test case (#6944)
Browse files Browse the repository at this point in the history
Add a new script to cover the following reboot cause scenarios:

BIOS - The BIOS upgrade process ended with a failure and caused the switch to reset.
CPU - Reset is initiated by SW on the CPU. It could be that SW encountered a catastrophic situation such as a memory leak and, eventually, the kernel reset the whole switch.
Reset from ASIC - Reset which is caused by ASIC.

- What is the motivation for this PR?
Add test for sonic-net/sonic-platform-common#277

- How did you do it?

- How did you verify/test it?
Added a test script for the enhanced reboot cause.

- Any platform specific information?
Mellanox platforms, except for SPC1 and SIMX

- Supported testbed topology if it's a new test case?
Any topology
  • Loading branch information
echuawu authored Dec 5, 2022
1 parent 945eac8 commit 0e46b47
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 36 deletions.
55 changes: 41 additions & 14 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
REBOOT_TYPE_WATCHDOG = "watchdog"
REBOOT_TYPE_UNKNOWN = "Unknown"
REBOOT_TYPE_THERMAL_OVERLOAD = "Thermal Overload"
# Reset sources (CPU, BIOS, ASIC). Their reboot_ctrl_dict entries define only
# timeout/wait/cause with "test_reboot_cause_only": True and no "command" key,
# so they serve to match the reported reboot cause rather than to issue a reboot.
REBOOT_TYPE_CPU = "cpu"
REBOOT_TYPE_BIOS = "bios"
REBOOT_TYPE_ASIC = "asic"

# Event to signal DUT activeness
DUT_ACTIVE = threading.Event()
Expand Down Expand Up @@ -87,6 +90,24 @@
"cause": "warm-reboot",
"test_reboot_cause_only": False
},
REBOOT_TYPE_CPU: {
"timeout": 300,
"wait": 120,
"cause": "CPU",
"test_reboot_cause_only": True
},
REBOOT_TYPE_BIOS: {
"timeout": 300,
"wait": 120,
"cause": "BIOS",
"test_reboot_cause_only": True
},
REBOOT_TYPE_ASIC: {
"timeout": 300,
"wait": 120,
"cause": "ASIC",
"test_reboot_cause_only": True
}
}

MAX_NUM_REBOOT_CAUSE_HISTORY = 10
Expand Down Expand Up @@ -187,7 +208,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
pool = ThreadPool()
hostname = duthost.hostname
try:
reboot_ctrl = reboot_ctrl_dict[reboot_type]
reboot_ctrl = reboot_ctrl_dict[reboot_type]
reboot_command = reboot_ctrl['command'] if reboot_type != REBOOT_TYPE_POWEROFF else None
if timeout == 0:
timeout = reboot_ctrl['timeout']
Expand All @@ -199,7 +220,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
raise ValueError('invalid reboot type: "{} for {}"'.format(reboot_type, hostname))

reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type)

wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res)
# if wait_for_ssh flag is False, do not wait for dut to boot up
if not wait_for_ssh:
Expand All @@ -222,7 +243,8 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
pool.terminate()
dut_uptime = duthost.get_up_time()
logger.info('DUT {} up since {}'.format(hostname, dut_uptime))
assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot".format(hostname)
assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot". \
format(hostname)


def get_reboot_cause(dut):
Expand All @@ -232,7 +254,7 @@ def get_reboot_cause(dut):
"""
logging.info('Getting reboot cause from dut {}'.format(dut.hostname))
output = dut.shell('show reboot-cause')
cause = output['stdout']
cause = output['stdout']

for type, ctrl in reboot_ctrl_dict.items():
if re.search(ctrl['cause'], cause):
Expand Down Expand Up @@ -282,13 +304,13 @@ def sync_reboot_history_queue_with_dut(dut):
dut_reboot_history_queue = dut.show_and_parse("show reboot-cause history")
dut_reboot_history_received = True
break
except Exception as e:
except Exception:
e_type, e_value, e_traceback = sys.exc_info()
logging.info("Exception type: %s" % e_type.__name__)
logging.info("Exception message: %s" % e_value)
logging.info("Backing off for %d seconds before retrying", ((retry_count+1) * RETRY_BACKOFF_TIME))
logging.info("Backing off for %d seconds before retrying", ((retry_count + 1) * RETRY_BACKOFF_TIME))

time.sleep(((retry_count+1) * RETRY_BACKOFF_TIME))
time.sleep(((retry_count + 1) * RETRY_BACKOFF_TIME))
continue

# If retry logic did not yield reboot cause history from DUT,
Expand Down Expand Up @@ -349,21 +371,26 @@ def check_reboot_cause_history(dut, reboot_type_history_queue):
logging.info("Verify reboot-cause history title")
if reboot_cause_history_got:
if not set(REBOOT_CAUSE_HISTORY_TITLE) == set(reboot_cause_history_got[0].keys()):
logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".format(
REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys()))
logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".
format(REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys()))
return False

logging.info("Verify reboot-cause output are sorted in reverse chronological order" )
logging.info("Verify reboot-cause output are sorted in reverse chronological order")
reboot_type_history_len = len(reboot_type_history_queue)
if reboot_type_history_len <= len(reboot_cause_history_got):
for index, reboot_type in enumerate(reboot_type_history_queue):
if reboot_type not in reboot_ctrl_dict:
logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".format(reboot_type))
logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".
format(reboot_type))
continue
logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" % (index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]))
if not re.search(reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]):
logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" %
(index, reboot_ctrl_dict[reboot_type]["cause"],
reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]))
if not re.search(reboot_ctrl_dict[reboot_type]["cause"],
reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]):
logging.error("The {} reboot-cause not match. expected_reboot type={}, actual_reboot_cause={}".format(
index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index]["cause"]))
index, reboot_ctrl_dict[reboot_type]["cause"],
reboot_cause_history_got[reboot_type_history_len - index]["cause"]))
return False
return True
logging.error("The number of expected reboot-cause:{} is more than that of actual reboot-cuase:{}".format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import logging
import time
from pkg_resources import parse_version
from tests.platform_tests.thermal_control_test_helper import *
from tests.platform_tests.thermal_control_test_helper import mocker, FanStatusMocker, ThermalStatusMocker, \
SingleFanMocker
from tests.common.mellanox_data import get_platform_data
from minimum_table import get_min_table

Expand Down Expand Up @@ -96,6 +97,7 @@
}
}


class SysfsNotExistError(Exception):
"""
Exception when sys fs not exist.
Expand Down Expand Up @@ -138,7 +140,6 @@ def __init__(self, dut):
:param dut: DUT object representing a SONiC switch under test.
"""
self.dut = dut
#self.unlink_file_list = {}
self._extract_num_of_fans_and_fan_drawers()
self.deinit_retry = 5

Expand Down Expand Up @@ -289,7 +290,7 @@ def deinit(self):
for file_path, link_target in self.unlink_file_list.items():
try:
self.dut.command('ln -f -s {} {}'.format(link_target, file_path))
except Exception as e:
except Exception:
# Catch any exception for later retry
failed_recover_links[file_path] = link_target

Expand All @@ -300,7 +301,7 @@ def deinit(self):
self.dut.shell('rm -f {}'.format(file_path))
else:
self.dut.shell('echo \'{}\' > {}'.format(value, file_path))
except Exception as e:
except Exception:
# Catch any exception for later retry
failed_recover_files[file_path] = value

Expand Down Expand Up @@ -417,7 +418,7 @@ def mock_fan_direction_fan_dir_per_fan(self, direction):
"""
try:
_ = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_PER_FAN.format(self.index)))
except SysfsNotExistError as e:
except SysfsNotExistError:
self.mocked_direction = NOT_AVAILABLE
return

Expand All @@ -438,7 +439,7 @@ def mock_fan_direction_fan_dir_for_all_fans(self, direction):
"""
try:
fan_dir_bits = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_ALL_FANS))
except SysfsNotExistError as e:
except SysfsNotExistError:
self.mocked_direction = NOT_AVAILABLE
return

Expand Down Expand Up @@ -479,6 +480,7 @@ def get_expect_led_color(self):

return 'green'


class FanData:
"""
Data mocker of a FAN.
Expand Down Expand Up @@ -713,7 +715,7 @@ def check_result(self, actual_data):
mismatch_in_actual_data = []
for actual_data_item in actual_data:
primary = actual_data_item[self.primary_field]
if not primary in expected:
if primary not in expected:
extra_in_actual_data.append(actual_data_item)
else:
for field in actual_data_item.keys():
Expand All @@ -726,16 +728,16 @@ def check_result(self, actual_data):

result = True
if len(extra_in_actual_data) > 0:
logging.error('Found extra data in actual_data: {}'\
.format(json.dumps(extra_in_actual_data, indent=2)))
logging.error('Found extra data in actual_data: {}'
.format(json.dumps(extra_in_actual_data, indent=2)))
result = False
if len(mismatch_in_actual_data) > 0:
logging.error('Found mismatch data in actual_data: {}'\
.format(json.dumps(mismatch_in_actual_data, indent=2)))
logging.error('Found mismatch data in actual_data: {}'
.format(json.dumps(mismatch_in_actual_data, indent=2)))
result = False
if len(expected.keys()) > 0:
logging.error('Expected data not found in actual_data: {}'\
.format(json.dumps(expected, indent=2)))
logging.error('Expected data not found in actual_data: {}'
.format(json.dumps(expected, indent=2)))
result = False

return result
Expand All @@ -761,7 +763,7 @@ def __init__(self, dut):
self.expected_data = {}
self.expected_data_headers = ['drawer', 'led', 'fan', 'speed', 'direction', 'presence', 'status']
self.primary_field = 'fan'
self.excluded_fields = ['timestamp',]
self.excluded_fields = ['timestamp', ]

def deinit(self):
"""
Expand All @@ -779,12 +781,11 @@ def mock_data(self):
drawer_index = 1
drawer_data = None
presence = 0
direction = NOT_AVAILABLE
naming_rule = FAN_NAMING_RULE['fan']
# All system fan is controlled to have the same speed, so only
# get a random value once here
speed = random.randint(60, 100)
FanData.mock_cooling_cur_state(self.mock_helper, speed/10)
FanData.mock_cooling_cur_state(self.mock_helper, speed / 10)
while fan_index <= MockerHelper.FAN_NUM:
try:
if (fan_index - 1) % MockerHelper.FAN_NUM_PER_DRAWER == 0:
Expand All @@ -806,7 +807,7 @@ def mock_data(self):
fan_data.mock_target_speed(speed)
self.expected_data[fan_data.name] = [
drawer_data.name,
'N/A', # update this value later
'N/A', # update this value later
fan_data.name,
'{}%'.format(fan_data.mocked_speed),
drawer_data.mocked_direction,
Expand Down Expand Up @@ -894,9 +895,10 @@ def __init__(self, dut):
ThermalStatusMocker.__init__(self, dut)
self.mock_helper = MockerHelper(dut)
self.expected_data = {}
self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th', 'warning']
self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th',
'warning']
self.primary_field = 'sensor'
self.excluded_fields = ['timestamp',]
self.excluded_fields = ['timestamp', ]

def deinit(self):
"""
Expand Down Expand Up @@ -1097,7 +1099,8 @@ def mock_over_speed(self):
Change the mocked FAN speed to faster than target speed and exceed speed tolerance.
:return:
"""
self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10)
self.fan_data.mock_speed(
AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10)
self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE)
self.expect_led_color = 'red'

Expand All @@ -1106,7 +1109,8 @@ def mock_under_speed(self):
Change the mocked FAN speed to slower than target speed and exceed speed tolerance.
:return:
"""
self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10)
self.fan_data.mock_speed(
AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10)
self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE)
self.expect_led_color = 'red'

Expand Down Expand Up @@ -1237,7 +1241,7 @@ def mock_power_threshold(self, number_psus):
if not max_power:
power = int(self.mock_helper.read_value(self.PSU_POWER.format(i + 1)))
# Round up to 100 watt and then double it to avoid noise when power fluctuate
max_power = int(round(power/100000000.0)) * 100000000 * 2
max_power = int(round(power / 100000000.0)) * 100000000 * 2
self.mock_helper.mock_value(self.PSU_POWER_CAPACITY.format(i + 1), max_power, True)

# Also mock ambient temperatures
Expand Down Expand Up @@ -1273,3 +1277,25 @@ def read_port_ambient_thermal(self):

def read_fan_ambient_thermal(self):
return int(self.mock_helper.read_value(self.FAN_AMBIENT_TEMP))


@mocker('RebootCauseMocker')
class RebootCauseMocker(object):
    """
    Mocker for the reset-cause files under /var/run/hw-management/system.

    Every ``mock_*`` method writes the value 1 to one reset-cause file through
    a MockerHelper instance. ``deinit`` hands cleanup over to that helper
    (presumably restoring the mocked files; verify against MockerHelper.deinit).
    """
    # Files whose content is mocked to simulate a given reset source.
    RESET_RELOAD_BIOS = '/var/run/hw-management/system/reset_reload_bios'
    RESET_FROM_COMEX = '/var/run/hw-management/system/reset_from_comex'
    RESET_FROM_ASIC = '/var/run/hw-management/system/reset_from_asic'

    def __init__(self, dut):
        """
        Constructor of RebootCauseMocker.
        :param dut: DUT object representing a SONiC switch under test.
        """
        self.mock_helper = MockerHelper(dut)

    def deinit(self):
        """
        Delegate cleanup to the underlying MockerHelper.
        :return:
        """
        helper = self.mock_helper
        helper.deinit()

    def _set_reset_flag(self, file_path):
        """
        Mock the given reset-cause file with the value 1.
        :param file_path: Path of the reset-cause file to mock.
        :return:
        """
        self.mock_helper.mock_value(file_path, 1)

    def mock_reset_reload_bios(self):
        """
        Mock the 'reset_reload_bios' file with the value 1.
        :return:
        """
        self._set_reset_flag(self.RESET_RELOAD_BIOS)

    def mock_reset_from_comex(self):
        """
        Mock the 'reset_from_comex' file with the value 1.
        :return:
        """
        self._set_reset_flag(self.RESET_FROM_COMEX)

    def mock_reset_from_asic(self):
        """
        Mock the 'reset_from_asic' file with the value 1.
        :return:
        """
        self._set_reset_flag(self.RESET_FROM_ASIC)
42 changes: 42 additions & 0 deletions tests/platform_tests/mellanox/test_reboot_cause.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import allure
import logging
import pytest
from tests.common.reboot import REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC, check_reboot_cause
from tests.platform_tests.thermal_control_test_helper import mocker_factory # noqa: F401

# Module-wide pytest marks: applied to every test in this file
# (asic 'mellanox', topology 'any').
pytestmark = [
pytest.mark.asic('mellanox'),
pytest.mark.topology('any')
]

logger = logging.getLogger(__name__)

# NOTE(review): test_reboot_cause binds its own local `mocker`, so this
# module-level name is never read in the code visible here - confirm whether
# it is still needed.
mocker = None
# Reboot cause types the test below is parametrized over.
REBOOT_CAUSE_TYPES = [REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC]


@pytest.mark.parametrize("reboot_cause", REBOOT_CAUSE_TYPES)
def test_reboot_cause(rand_selected_dut, mocker_factory, reboot_cause):  # noqa: F811
    """
    Check that a mocked cpu/bios/asic reset is reported as the reboot cause.

    Steps: create a RebootCauseMocker, mock the reset source that matches
    ``reboot_cause``, restart the determine-reboot-cause service, then verify
    the reboot cause with check_reboot_cause.

    :param rand_selected_dut: The fixture returns a randomly selected DUT
    :param mocker_factory: The fixture returns a mocker
    :param reboot_cause: The specific reboot cause
    """
    # Name of the RebootCauseMocker method that mocks each reboot cause.
    mock_method_by_cause = {
        REBOOT_TYPE_CPU: 'mock_reset_from_comex',
        REBOOT_TYPE_BIOS: 'mock_reset_reload_bios',
        REBOOT_TYPE_ASIC: 'mock_reset_from_asic',
    }
    dut = rand_selected_dut

    with allure.step('Create mocker - RebootCauseMocker'):
        cause_mocker = mocker_factory(dut, 'RebootCauseMocker')

    with allure.step('Mock reset from {}'.format(reboot_cause)):
        method_name = mock_method_by_cause.get(reboot_cause)
        # A cause without a mapping is left unmocked; nothing is called.
        if method_name is not None:
            getattr(cause_mocker, method_name)()

    with allure.step('Restart determine-reboot-cause service'):
        dut.restart_service('determine-reboot-cause')

    with allure.step('Check Reboot cause is {}'.format(reboot_cause)):
        check_reboot_cause(dut, reboot_cause)

0 comments on commit 0e46b47

Please sign in to comment.