Skip to content

Commit

Permalink
[thermalctld] Update line card thermal sensor status to DB (#211)
Browse files Browse the repository at this point in the history
Update line card thermal sensor status to DB, including the PSU thermal sensors and SFP thermal sensors on the line card. Depends on sonic-net/sonic-buildimage#8422.

#### Description

In the thermal update function, update the PSU, SFP and direct thermals of the line card.

#### Motivation and Context

To support modular chassis

#### How Has This Been Tested?

1. Full platform regression, 100% passed
2. Unit test passed
  • Loading branch information
Junchao-Mellanox authored Oct 18, 2021
1 parent 294995b commit 1565a23
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 86 deletions.
202 changes: 124 additions & 78 deletions sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -538,11 +538,17 @@ class TemperatureUpdater(logger.Logger):

self.is_chassis_system = chassis.is_modular_chassis()
if self.is_chassis_system:
self.module_thermals = set()
my_slot = try_get(chassis.get_my_slot, INVALID_SLOT)
if my_slot != INVALID_SLOT:
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
try:
# Modular chassis does not have to have table CHASSIS_STATE_DB.
# So catch the exception here and ignore it.
table_name = TemperatureUpdater.TEMPER_INFO_TABLE_NAME+'_'+str(my_slot)
chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB")
self.chassis_table = swsscommon.Table(chassis_state_db, table_name)
except Exception as e:
self.chassis_table = None

def deinit(self):
"""
Expand Down Expand Up @@ -576,31 +582,61 @@ class TemperatureUpdater(logger.Logger):
for index, thermal in enumerate(self.chassis.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(CHASSIS_INFO_KEY, thermal, index)

for psu_index, psu in enumerate(self.chassis.get_all_psus()):
parent_name = 'PSU {}'.format(psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(parent_name, thermal, thermal_index)

for sfp_index, sfp in enumerate(self.chassis.get_all_sfps()):
parent_name = 'SFP {}'.format(sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return
try:
self._refresh_temperature_status(parent_name, thermal, thermal_index)
except Exception as e:
self.log_warning('Failed to update thermal status - {}'.format(repr(e)))

self._refresh_temperature_status(parent_name, thermal, thermal_index)

if self.is_chassis_system:
available_thermals = set()
for module_index, module in enumerate(self.chassis.get_all_modules()):
module_name = try_get(module.get_name, 'Module {}'.format(module_index + 1))

for thermal_index, thermal in enumerate(module.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, module_name, thermal_index))
self._refresh_temperature_status(module_name, thermal, thermal_index)

for sfp_index, sfp in enumerate(module.get_all_sfps()):
sfp_name = '{} SFP {}'.format(module_name, sfp_index + 1)
for thermal_index, thermal in enumerate(sfp.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, sfp_name, thermal_index))
self._refresh_temperature_status(sfp_name, thermal, thermal_index)

for psu_index, psu in enumerate(module.get_all_psus()):
psu_name = '{} PSU {}'.format(module_name, psu_index + 1)
for thermal_index, thermal in enumerate(psu.get_all_thermals()):
if self.task_stopping_event.is_set():
return

available_thermals.add((thermal, psu_name, thermal_index))
self._refresh_temperature_status(psu_name, thermal, thermal_index)


thermals_to_remove = self.module_thermals - available_thermals
self.module_thermals = available_thermals
for thermal, parent_name, thermal_index in thermals_to_remove:
self._remove_thermal_from_db(thermal, parent_name, thermal_index)

self.log_debug("End temperature updating")

def _refresh_temperature_status(self, parent_name, thermal, thermal_index):
Expand All @@ -611,72 +647,82 @@ class TemperatureUpdater(logger.Logger):
:param thermal_index: Index of the thermal object in platform chassis
:return:
"""
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
try:
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))

# Only save entity info for thermals that belong to chassis and PSU
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
# logical port
if 'SFP' not in parent_name:
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

if name not in self.temperature_status_dict:
self.temperature_status_dict[name] = TemperatureStatus()

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
maximum_temperature = NOT_AVAILABLE
minimum_temperature = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
is_replaceable = try_get(thermal.is_replaceable, False)
if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
minimum_temperature = try_get(thermal.get_minimum_recorded)
maximum_temperature = try_get(thermal.get_maximum_recorded)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
self._log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
self._log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature

fvs = swsscommon.FieldValuePairs(
[('temperature', str(temperature)),
('minimum_temperature', str(minimum_temperature)),
('maximum_temperature', str(maximum_temperature)),
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])

# Only save entity info for thermals that belong to chassis and PSU
# for SFP thermal, they don't need save entity info because snmp can deduce the relation from TRANSCEIVER_DOM_SENSOR
# and as we save logical port in TRANSCEIVER_INFO table, for split cable, a SFP thermal might have multiple parent
# logical port
if 'SFP' not in parent_name:
update_entity_info(self.phy_entity_table, parent_name, name, thermal, thermal_index + 1)

if name not in self.temperature_status_dict:
self.temperature_status_dict[name] = TemperatureStatus()

temperature_status = self.temperature_status_dict[name]

high_threshold = NOT_AVAILABLE
low_threshold = NOT_AVAILABLE
high_critical_threshold = NOT_AVAILABLE
low_critical_threshold = NOT_AVAILABLE
maximum_temperature = NOT_AVAILABLE
minimum_temperature = NOT_AVAILABLE
temperature = try_get(thermal.get_temperature)
is_replaceable = try_get(thermal.is_replaceable, False)
if temperature != NOT_AVAILABLE:
temperature_status.set_temperature(name, temperature)
minimum_temperature = try_get(thermal.get_minimum_recorded)
maximum_temperature = try_get(thermal.get_maximum_recorded)
high_threshold = try_get(thermal.get_high_threshold)
low_threshold = try_get(thermal.get_low_threshold)
high_critical_threshold = try_get(thermal.get_high_critical_threshold)
low_critical_threshold = try_get(thermal.get_low_critical_threshold)

warning = False
if temperature != NOT_AVAILABLE and temperature_status.set_over_temperature(temperature, high_threshold):
self._log_on_status_changed(not temperature_status.over_temperature,
'High temperature warning cleared: {} temperature restored to {}C, high threshold {}C'.
format(name, temperature, high_threshold),
'High temperature warning: {} current temperature {}C, high threshold {}C'.
format(name, temperature, high_threshold)
)
warning = warning | temperature_status.over_temperature

if temperature != NOT_AVAILABLE and temperature_status.set_under_temperature(temperature, low_threshold):
self._log_on_status_changed(not temperature_status.under_temperature,
'Low temperature warning cleared: {} temperature restored to {}C, low threshold {}C'.
format(name, temperature, low_threshold),
'Low temperature warning: {} current temperature {}C, low threshold {}C'.
format(name, temperature, low_threshold)
)
warning = warning | temperature_status.under_temperature
self.table.set(name, fvs)
if self.is_chassis_system and self.chassis_table is not None:
self.chassis_table.set(name, fvs)
except Exception as e:
self.log_warning('Failed to update thermal status for {} - {}'.format(name, repr(e)))

fvs = swsscommon.FieldValuePairs(
[('temperature', str(temperature)),
('minimum_temperature', str(minimum_temperature)),
('maximum_temperature', str(maximum_temperature)),
('high_threshold', str(high_threshold)),
('low_threshold', str(low_threshold)),
('warning_status', str(warning)),
('critical_high_threshold', str(high_critical_threshold)),
('critical_low_threshold', str(low_critical_threshold)),
('is_replaceable', str(is_replaceable)),
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
])
def _remove_thermal_from_db(self, thermal, parent_name, thermal_index):
name = try_get(thermal.get_name, '{} Thermal {}'.format(parent_name, thermal_index + 1))
self.table._del(name)

self.table.set(name, fvs)
if self.is_chassis_system and self.chassis_table is not None:
self.chassis_table.set(name, fvs)
if self.chassis_table is not None:
self.chassis_table._del(name)


class ThermalMonitor(ProcessTaskBase):
Expand Down
16 changes: 16 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,17 @@ def make_error_thermal(self):
thermal = MockErrorThermal()
self._thermal_list.append(thermal)

def make_module_thermal(self):
    """
    Register a mock module that owns one direct thermal, one SFP and one PSU,
    where the SFP and the PSU each carry a single thermal of their own
    """
    # Objects are created in the same order as before; only the wiring is grouped.
    new_module = MockModule()
    module_sfp = MockSfp()
    module_sfp._thermal_list.append(MockThermal())
    module_psu = MockPsu()
    module_psu._thermal_list.append(MockThermal())

    # Attach the children to the module, then give the module its direct thermal
    new_module._sfp_list.append(module_sfp)
    new_module._psu_list.append(module_psu)
    new_module._thermal_list.append(MockThermal())

    self._module_list.append(new_module)

def is_modular_chassis(self):
    """
    Report the modular-chassis flag stored on this mock
    :return: The current value of self._is_chassis_system
    """
    is_modular = self._is_chassis_system
    return is_modular

Expand Down Expand Up @@ -430,3 +441,8 @@ def get_position_in_parent(self):

def is_replaceable(self):
    """
    Report the replaceable flag stored on this mock
    :return: The current value of self._replaceable
    """
    replaceable = self._replaceable
    return replaceable


class MockModule(module_base.ModuleBase):
    """
    Minimal mock module: a module_base.ModuleBase subclass that adds no behavior
    of its own and only runs the base class initializer
    """

    def __init__(self):
        # Explicit two-argument super() keeps this usable under Python 2 as well
        super(MockModule, self).__init__()
28 changes: 20 additions & 8 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,15 +470,15 @@ def test_update_psu_thermals(self):
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message')")
else:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for PSU 1 Thermal 1 - Exception('Test message',)")

def test_update_sfp_thermals(self):
chassis = MockChassis()
Expand All @@ -490,15 +490,15 @@ def test_update_sfp_thermals(self):
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 0

temperature_updater._refresh_temperature_status = mock.MagicMock(side_effect=Exception("Test message"))
mock_thermal.get_temperature = mock.MagicMock(side_effect=Exception("Test message"))
temperature_updater.update()
assert temperature_updater.log_warning.call_count == 1

# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message')")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message')")
else:
temperature_updater.log_warning.assert_called_with("Failed to update thermal status - Exception('Test message',)")
temperature_updater.log_warning.assert_called_with("Failed to update thermal status for SFP 1 Thermal 1 - Exception('Test message',)")

def test_update_thermal_with_exception(self):
chassis = MockChassis()
Expand All @@ -514,16 +514,28 @@ def test_update_thermal_with_exception(self):
# TODO: Clean this up once we no longer need to support Python 2
if sys.version_info.major == 3:
expected_calls = [
mock.call("Failed to update thermal status - Exception('Failed to get temperature')"),
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature')"),
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
]
else:
expected_calls = [
mock.call("Failed to update thermal status - Exception('Failed to get temperature',)"),
mock.call("Failed to update thermal status for chassis 1 Thermal 1 - Exception('Failed to get temperature',)"),
mock.call('High temperature warning: chassis 1 Thermal 2 current temperature 3C, high threshold 2C')
]
assert temperature_updater.log_warning.mock_calls == expected_calls

def test_update_module_thermals(self):
    """
    Module thermals are tracked while the module is present and are dropped
    from the tracked set once the module disappears from the chassis
    """
    mock_chassis = MockChassis()
    mock_chassis.make_module_thermal()
    mock_chassis.set_modular_chassis(True)
    updater = thermalctld.TemperatureUpdater(mock_chassis, multiprocessing.Event())

    # make_module_thermal() adds one direct, one SFP and one PSU thermal
    updater.update()
    assert len(updater.module_thermals) == 3

    # With no modules left, the next update must leave no tracked module thermals
    mock_chassis._module_list = []
    updater.update()
    assert len(updater.module_thermals) == 0


# Modular chassis-related tests

Expand Down

0 comments on commit 1565a23

Please sign in to comment.