Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance Mellanox reboot cause test case #6944

Merged
merged 1 commit into from
Dec 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 41 additions & 14 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
REBOOT_TYPE_WATCHDOG = "watchdog"
REBOOT_TYPE_UNKNOWN = "Unknown"
REBOOT_TYPE_THERMAL_OVERLOAD = "Thermal Overload"
# Reboot types exercised by tests/platform_tests/mellanox/test_reboot_cause.py.
# Each has an entry below whose "cause" pattern is "CPU", "BIOS" or "ASIC"
# and whose "test_reboot_cause_only" flag is True.
REBOOT_TYPE_CPU = "cpu"
REBOOT_TYPE_BIOS = "bios"
REBOOT_TYPE_ASIC = "asic"

# Event to signal DUT activeness
DUT_ACTIVE = threading.Event()
Expand Down Expand Up @@ -87,6 +90,24 @@
"cause": "warm-reboot",
"test_reboot_cause_only": False
},
REBOOT_TYPE_CPU: {
"timeout": 300,
"wait": 120,
"cause": "CPU",
"test_reboot_cause_only": True
},
REBOOT_TYPE_BIOS: {
"timeout": 300,
"wait": 120,
"cause": "BIOS",
"test_reboot_cause_only": True
},
REBOOT_TYPE_ASIC: {
"timeout": 300,
"wait": 120,
"cause": "ASIC",
"test_reboot_cause_only": True
}
}

MAX_NUM_REBOOT_CAUSE_HISTORY = 10
Expand Down Expand Up @@ -187,7 +208,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
pool = ThreadPool()
hostname = duthost.hostname
try:
reboot_ctrl = reboot_ctrl_dict[reboot_type]
reboot_ctrl = reboot_ctrl_dict[reboot_type]
reboot_command = reboot_ctrl['command'] if reboot_type != REBOOT_TYPE_POWEROFF else None
if timeout == 0:
timeout = reboot_ctrl['timeout']
Expand All @@ -199,7 +220,7 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
raise ValueError('invalid reboot type: "{} for {}"'.format(reboot_type, hostname))

reboot_res, dut_datetime = perform_reboot(duthost, pool, reboot_command, reboot_helper, reboot_kwargs, reboot_type)

wait_for_shutdown(duthost, localhost, delay, timeout, reboot_res)
# if wait_for_ssh flag is False, do not wait for dut to boot up
if not wait_for_ssh:
Expand All @@ -222,7 +243,8 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
pool.terminate()
dut_uptime = duthost.get_up_time()
logger.info('DUT {} up since {}'.format(hostname, dut_uptime))
assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot".format(hostname)
assert float(dut_uptime.strftime("%s")) > float(dut_datetime.strftime("%s")), "Device {} did not reboot". \
format(hostname)


def get_reboot_cause(dut):
Expand All @@ -232,7 +254,7 @@ def get_reboot_cause(dut):
"""
logging.info('Getting reboot cause from dut {}'.format(dut.hostname))
output = dut.shell('show reboot-cause')
cause = output['stdout']
cause = output['stdout']

for type, ctrl in reboot_ctrl_dict.items():
if re.search(ctrl['cause'], cause):
Expand Down Expand Up @@ -282,13 +304,13 @@ def sync_reboot_history_queue_with_dut(dut):
dut_reboot_history_queue = dut.show_and_parse("show reboot-cause history")
dut_reboot_history_received = True
break
except Exception as e:
except Exception:
e_type, e_value, e_traceback = sys.exc_info()
logging.info("Exception type: %s" % e_type.__name__)
logging.info("Exception message: %s" % e_value)
logging.info("Backing off for %d seconds before retrying", ((retry_count+1) * RETRY_BACKOFF_TIME))
logging.info("Backing off for %d seconds before retrying", ((retry_count + 1) * RETRY_BACKOFF_TIME))

time.sleep(((retry_count+1) * RETRY_BACKOFF_TIME))
time.sleep(((retry_count + 1) * RETRY_BACKOFF_TIME))
continue

# If retry logic did not yield reboot cause history from DUT,
Expand Down Expand Up @@ -349,21 +371,26 @@ def check_reboot_cause_history(dut, reboot_type_history_queue):
logging.info("Verify reboot-cause history title")
if reboot_cause_history_got:
if not set(REBOOT_CAUSE_HISTORY_TITLE) == set(reboot_cause_history_got[0].keys()):
logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".format(
REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys()))
logging.error("Expected reboot-cause history title:{} not match actual reboot-cause history title:{}".
format(REBOOT_CAUSE_HISTORY_TITLE, reboot_cause_history_got[0].keys()))
return False

logging.info("Verify reboot-cause output are sorted in reverse chronological order" )
logging.info("Verify reboot-cause output are sorted in reverse chronological order")
reboot_type_history_len = len(reboot_type_history_queue)
if reboot_type_history_len <= len(reboot_cause_history_got):
for index, reboot_type in enumerate(reboot_type_history_queue):
if reboot_type not in reboot_ctrl_dict:
logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".format(reboot_type))
logging.warn("Reboot type: {} not in dictionary. Skipping history check for this entry.".
format(reboot_type))
continue
logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" % (index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]))
if not re.search(reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index-1]["cause"]):
logging.info("index: %d, reboot cause: %s, reboot cause from DUT: %s" %
(index, reboot_ctrl_dict[reboot_type]["cause"],
reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]))
if not re.search(reboot_ctrl_dict[reboot_type]["cause"],
reboot_cause_history_got[reboot_type_history_len - index - 1]["cause"]):
logging.error("The {} reboot-cause not match. expected_reboot type={}, actual_reboot_cause={}".format(
index, reboot_ctrl_dict[reboot_type]["cause"], reboot_cause_history_got[reboot_type_history_len-index]["cause"]))
index, reboot_ctrl_dict[reboot_type]["cause"],
reboot_cause_history_got[reboot_type_history_len - index]["cause"]))
return False
return True
logging.error("The number of expected reboot-cause:{} is more than that of actual reboot-cuase:{}".format(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import logging
import time
from pkg_resources import parse_version
from tests.platform_tests.thermal_control_test_helper import *
from tests.platform_tests.thermal_control_test_helper import mocker, FanStatusMocker, ThermalStatusMocker, \
SingleFanMocker
from tests.common.mellanox_data import get_platform_data
from minimum_table import get_min_table

Expand Down Expand Up @@ -96,6 +97,7 @@
}
}


class SysfsNotExistError(Exception):
"""
Exception when sys fs not exist.
Expand Down Expand Up @@ -138,7 +140,6 @@ def __init__(self, dut):
:param dut: DUT object representing a SONiC switch under test.
"""
self.dut = dut
#self.unlink_file_list = {}
self._extract_num_of_fans_and_fan_drawers()
self.deinit_retry = 5

Expand Down Expand Up @@ -289,7 +290,7 @@ def deinit(self):
for file_path, link_target in self.unlink_file_list.items():
try:
self.dut.command('ln -f -s {} {}'.format(link_target, file_path))
except Exception as e:
except Exception:
# Catch any exception for later retry
failed_recover_links[file_path] = link_target

Expand All @@ -300,7 +301,7 @@ def deinit(self):
self.dut.shell('rm -f {}'.format(file_path))
else:
self.dut.shell('echo \'{}\' > {}'.format(value, file_path))
except Exception as e:
except Exception:
# Catch any exception for later retry
failed_recover_files[file_path] = value

Expand Down Expand Up @@ -417,7 +418,7 @@ def mock_fan_direction_fan_dir_per_fan(self, direction):
"""
try:
_ = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_PER_FAN.format(self.index)))
except SysfsNotExistError as e:
except SysfsNotExistError:
self.mocked_direction = NOT_AVAILABLE
return

Expand All @@ -438,7 +439,7 @@ def mock_fan_direction_fan_dir_for_all_fans(self, direction):
"""
try:
fan_dir_bits = int(self.helper.read_value(FanDrawerData.FAN_DIR_PATH_ALL_FANS))
except SysfsNotExistError as e:
except SysfsNotExistError:
self.mocked_direction = NOT_AVAILABLE
return

Expand Down Expand Up @@ -479,6 +480,7 @@ def get_expect_led_color(self):

return 'green'


class FanData:
"""
Data mocker of a FAN.
Expand Down Expand Up @@ -713,7 +715,7 @@ def check_result(self, actual_data):
mismatch_in_actual_data = []
for actual_data_item in actual_data:
primary = actual_data_item[self.primary_field]
if not primary in expected:
if primary not in expected:
extra_in_actual_data.append(actual_data_item)
else:
for field in actual_data_item.keys():
Expand All @@ -726,16 +728,16 @@ def check_result(self, actual_data):

result = True
if len(extra_in_actual_data) > 0:
logging.error('Found extra data in actual_data: {}'\
.format(json.dumps(extra_in_actual_data, indent=2)))
logging.error('Found extra data in actual_data: {}'
.format(json.dumps(extra_in_actual_data, indent=2)))
result = False
if len(mismatch_in_actual_data) > 0:
logging.error('Found mismatch data in actual_data: {}'\
.format(json.dumps(mismatch_in_actual_data, indent=2)))
logging.error('Found mismatch data in actual_data: {}'
.format(json.dumps(mismatch_in_actual_data, indent=2)))
result = False
if len(expected.keys()) > 0:
logging.error('Expected data not found in actual_data: {}'\
.format(json.dumps(expected, indent=2)))
logging.error('Expected data not found in actual_data: {}'
.format(json.dumps(expected, indent=2)))
result = False

return result
Expand All @@ -761,7 +763,7 @@ def __init__(self, dut):
self.expected_data = {}
self.expected_data_headers = ['drawer', 'led', 'fan', 'speed', 'direction', 'presence', 'status']
self.primary_field = 'fan'
self.excluded_fields = ['timestamp',]
self.excluded_fields = ['timestamp', ]

def deinit(self):
"""
Expand All @@ -779,12 +781,11 @@ def mock_data(self):
drawer_index = 1
drawer_data = None
presence = 0
direction = NOT_AVAILABLE
naming_rule = FAN_NAMING_RULE['fan']
# All system fan is controlled to have the same speed, so only
# get a random value once here
speed = random.randint(60, 100)
FanData.mock_cooling_cur_state(self.mock_helper, speed/10)
FanData.mock_cooling_cur_state(self.mock_helper, speed / 10)
while fan_index <= MockerHelper.FAN_NUM:
try:
if (fan_index - 1) % MockerHelper.FAN_NUM_PER_DRAWER == 0:
Expand All @@ -806,7 +807,7 @@ def mock_data(self):
fan_data.mock_target_speed(speed)
self.expected_data[fan_data.name] = [
drawer_data.name,
'N/A', # update this value later
'N/A', # update this value later
fan_data.name,
'{}%'.format(fan_data.mocked_speed),
drawer_data.mocked_direction,
Expand Down Expand Up @@ -894,9 +895,10 @@ def __init__(self, dut):
ThermalStatusMocker.__init__(self, dut)
self.mock_helper = MockerHelper(dut)
self.expected_data = {}
self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th', 'warning']
self.expected_data_headers = ['sensor', 'temperature', 'high th', 'low th', 'crit high th', 'crit low th',
'warning']
self.primary_field = 'sensor'
self.excluded_fields = ['timestamp',]
self.excluded_fields = ['timestamp', ]

def deinit(self):
"""
Expand Down Expand Up @@ -1097,7 +1099,8 @@ def mock_over_speed(self):
Change the mocked FAN speed to faster than target speed and exceed speed tolerance.
:return:
"""
self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10)
self.fan_data.mock_speed(
AbnormalFanMocker.TARGET_SPEED_VALUE * (100 + AbnormalFanMocker.SPEED_TOLERANCE) / 100 + 10)
self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE)
self.expect_led_color = 'red'

Expand All @@ -1106,7 +1109,8 @@ def mock_under_speed(self):
Change the mocked FAN speed to slower than target speed and exceed speed tolerance.
:return:
"""
self.fan_data.mock_speed(AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10)
self.fan_data.mock_speed(
AbnormalFanMocker.TARGET_SPEED_VALUE * (100 - AbnormalFanMocker.SPEED_TOLERANCE) / 100 - 10)
self.fan_data.mock_target_speed(AbnormalFanMocker.TARGET_SPEED_VALUE)
self.expect_led_color = 'red'

Expand Down Expand Up @@ -1237,7 +1241,7 @@ def mock_power_threshold(self, number_psus):
if not max_power:
power = int(self.mock_helper.read_value(self.PSU_POWER.format(i + 1)))
# Round up to 100 watt and then double it to avoid noise when power fluctuate
max_power = int(round(power/100000000.0)) * 100000000 * 2
max_power = int(round(power / 100000000.0)) * 100000000 * 2
self.mock_helper.mock_value(self.PSU_POWER_CAPACITY.format(i + 1), max_power, True)

# Also mock ambient temperatures
Expand Down Expand Up @@ -1273,3 +1277,25 @@ def read_port_ambient_thermal(self):

def read_fan_ambient_thermal(self):
return int(self.mock_helper.read_value(self.FAN_AMBIENT_TEMP))


@mocker('RebootCauseMocker')
class RebootCauseMocker(object):
    """
    Mocker for the hw-management reset cause files of the DUT.

    Every ``mock_reset_*`` method asks the MockerHelper to mock the value 1
    in one of the files listed below.
    """
    # Files under /var/run/hw-management/system, one per mocked reset source
    RESET_RELOAD_BIOS = '/var/run/hw-management/system/reset_reload_bios'
    RESET_FROM_COMEX = '/var/run/hw-management/system/reset_from_comex'
    RESET_FROM_ASIC = '/var/run/hw-management/system/reset_from_asic'

    def __init__(self, dut):
        """
        Constructor of RebootCauseMocker.
        :param dut: DUT object, handed over to MockerHelper.
        """
        self.mock_helper = MockerHelper(dut)

    def deinit(self):
        """
        Delegate the cleanup to the MockerHelper.
        :return:
        """
        self.mock_helper.deinit()

    def _raise_reset_flag(self, flag_path):
        """
        Mock the value 1 in the given reset cause file.
        :param flag_path: Path of the reset cause file.
        :return:
        """
        self.mock_helper.mock_value(flag_path, 1)

    def mock_reset_reload_bios(self):
        """
        Mock the value 1 in RESET_RELOAD_BIOS.
        :return:
        """
        self._raise_reset_flag(self.RESET_RELOAD_BIOS)

    def mock_reset_from_comex(self):
        """
        Mock the value 1 in RESET_FROM_COMEX.
        :return:
        """
        self._raise_reset_flag(self.RESET_FROM_COMEX)

    def mock_reset_from_asic(self):
        """
        Mock the value 1 in RESET_FROM_ASIC.
        :return:
        """
        self._raise_reset_flag(self.RESET_FROM_ASIC)
42 changes: 42 additions & 0 deletions tests/platform_tests/mellanox/test_reboot_cause.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import allure
import logging
import pytest
from tests.common.reboot import REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC, check_reboot_cause
from tests.platform_tests.thermal_control_test_helper import mocker_factory # noqa: F401

# Module-wide pytest markers: asic('mellanox') and topology('any').
# NOTE(review): presumably these limit the module to Mellanox ASIC platforms
# on any topology - confirm against the marker handling in conftest.
pytestmark = [
pytest.mark.asic('mellanox'),
pytest.mark.topology('any')
]

logger = logging.getLogger(__name__)

# Module-level placeholder; the test below never reads it (it works with the
# mocker object it gets from mocker_factory inside the function).
mocker = None
# Values of the "reboot_cause" parameter of test_reboot_cause.
REBOOT_CAUSE_TYPES = [REBOOT_TYPE_CPU, REBOOT_TYPE_BIOS, REBOOT_TYPE_ASIC]


@pytest.mark.parametrize("reboot_cause", REBOOT_CAUSE_TYPES)
def test_reboot_cause(rand_selected_dut, mocker_factory, reboot_cause):    # noqa: F811
    """
    Mock a reset from cpu/bios/asic, restart the determine-reboot-cause
    service and check the reboot cause reported for the DUT.
    :param rand_selected_dut: The fixture returns a randomly selected DUT
    :param mocker_factory: The fixture returns a mocker
    :param reboot_cause: The specific reboot cause
    """
    # Name of the RebootCauseMocker method to call for each reboot cause.
    # A cause missing from this table mocks nothing.
    mock_method_by_cause = {
        REBOOT_TYPE_CPU: 'mock_reset_from_comex',
        REBOOT_TYPE_BIOS: 'mock_reset_reload_bios',
        REBOOT_TYPE_ASIC: 'mock_reset_from_asic',
    }

    with allure.step('Create mocker - RebootCauseMocker'):
        reboot_cause_mocker = mocker_factory(rand_selected_dut, 'RebootCauseMocker')

    with allure.step('Mock reset from {}'.format(reboot_cause)):
        method_name = mock_method_by_cause.get(reboot_cause)
        if method_name is not None:
            getattr(reboot_cause_mocker, method_name)()

    with allure.step('Restart determine-reboot-cause service'):
        rand_selected_dut.restart_service('determine-reboot-cause')

    with allure.step('Check Reboot cause is {}'.format(reboot_cause)):
        check_reboot_cause(rand_selected_dut, reboot_cause)