From d8318aeee3a30dcdd30912000a11f038b5e5abba Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 10 Mar 2020 01:41:10 +0800 Subject: [PATCH 01/14] Add thermal control support for SONiC (#3949) Conflicts: src/sonic-platform-common src/sonic-platform-daemons --- .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 72 +++++ .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../thermal_policy.json | 1 + .../docker-pmon.supervisord.conf.j2 | 11 + dockers/docker-platform-monitor/start.sh.j2 | 4 + ...-Disable-thermal-policy-running-in-h.patch | 31 ++ platform/mellanox/mlnx-platform-api.mk | 1 + .../mellanox/mlnx-platform-api/.gitignore | 2 + .../mellanox/mlnx-platform-api/pytest.ini | 3 + platform/mellanox/mlnx-platform-api/setup.cfg | 2 + platform/mellanox/mlnx-platform-api/setup.py | 9 + .../sonic_platform/__init__.py | 2 +- .../sonic_platform/chassis.py | 5 + .../mlnx-platform-api/sonic_platform/fan.py | 20 +- .../sonic_platform/platform.py | 1 + .../mlnx-platform-api/sonic_platform/psu.py | 6 +- .../sonic_platform/thermal.py | 52 +++- .../sonic_platform/thermal_actions.py | 108 +++++++ .../sonic_platform/thermal_conditions.py | 63 ++++ .../sonic_platform/thermal_infos.py | 136 +++++++++ .../sonic_platform/thermal_manager.py | 50 ++++ .../mlnx-platform-api/tests/__init__.py | 0 .../mlnx-platform-api/tests/mock_platform.py | 44 +++ .../tests/test_thermal_policy.py | 272 ++++++++++++++++++ .../tests/thermal_policy.json | 72 +++++ rules/docker-platform-monitor.mk | 2 +- rules/sonic-thermalctld.mk | 6 + .../sonic_daemon_base/task_base.py | 50 ++++ src/sonic-platform-common | 2 +- 37 files changed, 1021 insertions(+), 15 deletions(-) create mode 120000 device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json create mode 
120000 device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json create mode 100644 device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json create mode 100644 platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch create mode 100644 platform/mellanox/mlnx-platform-api/.gitignore create mode 100644 platform/mellanox/mlnx-platform-api/pytest.ini create mode 100644 platform/mellanox/mlnx-platform-api/setup.cfg create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py create mode 100644 platform/mellanox/mlnx-platform-api/tests/__init__.py create mode 100644 platform/mellanox/mlnx-platform-api/tests/mock_platform.py create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py create mode 100644 platform/mellanox/mlnx-platform-api/tests/thermal_policy.json create mode 100644 rules/sonic-thermalctld.mk create mode 100644 src/sonic-daemon-base/sonic_daemon_base/task_base.py diff --git a/device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json 
b/device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_lssn2700-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2010-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2100-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2410-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json new file mode 100644 index 000000000000..054d797be951 --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -0,0 +1,72 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "false", + "fan_speed_when_suspend": "60" + }, + "info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": 
"thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "60" + } + ] + } + ] +} \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2700_simx-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn2740-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3700-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3700_simx-r0/thermal_policy.json @@ -0,0 +1 @@ 
+../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3700c-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3800-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 b/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 index 5d848776f2a7..d33b4e7c3fe7 100644 --- a/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 +++ b/dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2 @@ -91,3 +91,14 @@ stdout_logfile=syslog stderr_logfile=syslog startsecs=10 {% endif %} + +{% if not skip_thermalctld %} +[program:thermalctld] +command=/usr/bin/thermalctld +priority=9 +autostart=false +autorestart=true +stdout_logfile=syslog +stderr_logfile=syslog +startsecs=0 +{% endif %} diff --git a/dockers/docker-platform-monitor/start.sh.j2 b/dockers/docker-platform-monitor/start.sh.j2 index 5b4fe4588819..03e0b49b8c4d 100644 --- a/dockers/docker-platform-monitor/start.sh.j2 +++ b/dockers/docker-platform-monitor/start.sh.j2 @@ -75,3 +75,7 @@ supervisorctl start psud supervisorctl start syseepromd {% endif %} +{% if not skip_thermalctld %} +supervisorctl start thermalctld +{% endif %} + diff --git a/platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch 
b/platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch new file mode 100644 index 000000000000..d1c34fd16ec0 --- /dev/null +++ b/platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch @@ -0,0 +1,31 @@ +From 76b02916794be2e2558fcff1d11609a594f633d7 Mon Sep 17 00:00:00 2001 +From: Stephen Sun +Date: Fri, 14 Feb 2020 13:48:00 +0800 +Subject: [PATCH] Disable thermal policy running in hw-mgmt service SONiC + thermal control algorithm has been supported. + +Signed-off-by: Stephen Sun +--- + usr/usr/bin/hw-management.sh | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh +index 2cdbfb2..48b41d5 100755 +--- a/usr/usr/bin/hw-management.sh ++++ b/usr/usr/bin/hw-management.sh +@@ -799,7 +799,11 @@ do_start() + #disabled for leopard chipless bringup. + echo 1 > $config_path/suspend + +- $THERMAL_CONTROL $thermal_type $max_tachos $max_psus& ++# ++# Disable thermal control algorithm in hw-management service ++# because there has already been that in SONiC ++# ++# $THERMAL_CONTROL $thermal_type $max_tachos $max_psus& + } + + do_stop() +-- +1.9.1 + diff --git a/platform/mellanox/mlnx-platform-api.mk b/platform/mellanox/mlnx-platform-api.mk index 4b70e59debc1..7bbbc3c70b0e 100644 --- a/platform/mellanox/mlnx-platform-api.mk +++ b/platform/mellanox/mlnx-platform-api.mk @@ -3,6 +3,7 @@ SONIC_PLATFORM_API_PY2 = mlnx_platform_api-1.0-py2-none-any.whl $(SONIC_PLATFORM_API_PY2)_SRC_PATH = $(PLATFORM_PATH)/mlnx-platform-api $(SONIC_PLATFORM_API_PY2)_PYTHON_VERSION = 2 +$(SONIC_PLATFORM_API_PY2)_DEPENDS = $(SONIC_PLATFORM_COMMON_PY2) $(SONIC_DAEMON_BASE_PY2) $(SONIC_CONFIG_ENGINE) SONIC_PYTHON_WHEELS += $(SONIC_PLATFORM_API_PY2) export mlnx_platform_api_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2))" diff --git a/platform/mellanox/mlnx-platform-api/.gitignore 
b/platform/mellanox/mlnx-platform-api/.gitignore new file mode 100644 index 000000000000..07f8a98e1f4a --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/.gitignore @@ -0,0 +1,2 @@ +*.pyc +.cache/ diff --git a/platform/mellanox/mlnx-platform-api/pytest.ini b/platform/mellanox/mlnx-platform-api/pytest.ini new file mode 100644 index 000000000000..c24fe5bb9e65 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning diff --git a/platform/mellanox/mlnx-platform-api/setup.cfg b/platform/mellanox/mlnx-platform-api/setup.cfg new file mode 100644 index 000000000000..b7e478982ccf --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/setup.cfg @@ -0,0 +1,2 @@ +[aliases] +test=pytest diff --git a/platform/mellanox/mlnx-platform-api/setup.py b/platform/mellanox/mlnx-platform-api/setup.py index 12809c4085ed..f10f84924d2c 100644 --- a/platform/mellanox/mlnx-platform-api/setup.py +++ b/platform/mellanox/mlnx-platform-api/setup.py @@ -12,6 +12,14 @@ maintainer_email='kevinw@mellanox.com', packages=[ 'sonic_platform', + 'tests' + ], + setup_requires= [ + 'pytest-runner' + ], + tests_require = [ + 'pytest', + 'mock>=2.0.0' ], classifiers=[ 'Development Status :: 3 - Alpha', @@ -26,5 +34,6 @@ 'Topic :: Utilities', ], keywords='sonic SONiC platform PLATFORM', + test_suite='setup.get_test_suite' ) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py b/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py index d94d4c9ec820..d82f3749319c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/__init__.py @@ -1,2 +1,2 @@ __all__ = ["platform", "chassis"] -from sonic_platform import * \ No newline at end of file +from sonic_platform import * diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 
c693b93462de..78f8dbc3c48e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -442,3 +442,8 @@ def get_change_event(self, timeout=0): return True, {'sfp':port_dict} else: return True, {'sfp':{}} + + def get_thermal_manager(self): + from .thermal_manager import ThermalManager + return ThermalManager + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 8b057e4123a2..2ec59b8e72fa 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -38,10 +38,12 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False): self.fan_speed_set_path = "fan{}_speed_set".format(self.index) self.fan_presence_path = "fan{}_status".format(self.drawer_index) self.fan_max_speed_path = "fan{}_max".format(self.index) + self._name = "fan{}".format(fan_index + 1) else: self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index) self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) - self.fan_max_speed_path = "psu{}_max".format(self.index) + self._name = 'psu_{}_fan_{}'.format(self.index, fan_index) + self.fan_max_speed_path = None self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) @@ -49,6 +51,9 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False): self.fan_pwm_path = "pwm1" self.fan_led_cap_path = "led_fan{}_capability".format(self.drawer_index) + def get_name(self): + return self._name + def get_status(self): """ Retrieves the operational status of fan @@ -123,7 +128,11 @@ def get_speed(self): speed_in_rpm = int(fan_curr_speed.read()) except (ValueError, IOError): speed_in_rpm = 0 - + + if self.fan_max_speed_path is None: + # in case of max speed 
unsupported, we just return speed in unit of RPM. + return speed_in_rpm + max_speed_in_rpm = self._get_max_speed_in_rpm() speed = 100*speed_in_rpm/max_speed_in_rpm @@ -136,11 +145,10 @@ def get_target_speed(self): Returns: int: percentage of the max fan speed """ - speed = 0 - if self.is_psu_fan: # Not like system fan, psu fan speed can not be modified, so target speed is N/A - return speed + return self.get_speed() + try: with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm: pwm = int(fan_pwm.read()) @@ -243,4 +251,4 @@ def get_speed_tolerance(self): considered tolerable """ # The tolerance value is fixed as 20% for all the Mellanox platform - return 20 \ No newline at end of file + return 20 diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py b/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py index 25461986f37a..6d81ca3e7b51 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/platform.py @@ -24,6 +24,7 @@ def __init__(self): self._chassis.initialize_psu() self._chassis.initialize_fan() self._chassis.initialize_eeprom() + self._chassis.initialize_thermals() def _is_host(self): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index f403678a6698..22091474e437 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -60,6 +60,7 @@ def __init__(self, psu_index, sku): psu_oper_status = "thermal/psu{}_pwr_status".format(self.index) #psu_oper_status should always be present for all SKUs self.psu_oper_status = os.path.join(self.psu_path, psu_oper_status) + self._name = "PSU{}".format(psu_index + 1) if sku in hwsku_dict_psu: filemap = psu_profile_list[hwsku_dict_psu[sku]] @@ -92,7 +93,10 @@ def __init__(self, psu_index, sku): fan = Fan(psu_index, psu_index, True) if 
fan.get_presence(): - self._fan = fan + self._fan_list.append(fan) + + def get_name(self): + return self._name def _read_generic_file(self, filename, len): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 1d03016af4ef..f445c3b25058 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -36,24 +36,31 @@ THERMAL_API_GET_TEMPERATURE = "get_temperature" THERMAL_API_GET_HIGH_THRESHOLD = "get_high_threshold" +THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD = "get_high_critical_threshold" + +THERMAL_API_INVALID_HIGH_THRESHOLD = 0.0 HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", - THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max" + THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"cpu_core{}_crit" } thermal_api_handler_cpu_pack = { THERMAL_API_GET_TEMPERATURE:"cpu_pack", - THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max" + THERMAL_API_GET_HIGH_THRESHOLD:"cpu_pack_max", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"cpu_pack_crit" } thermal_api_handler_module = { THERMAL_API_GET_TEMPERATURE:"module{}_temp_input", - THERMAL_API_GET_HIGH_THRESHOLD:"module{}_temp_crit" + THERMAL_API_GET_HIGH_THRESHOLD:"module{}_temp_crit", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:"module{}_temp_emergency" } thermal_api_handler_psu = { THERMAL_API_GET_TEMPERATURE:"psu{}_temp", - THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max" + THERMAL_API_GET_HIGH_THRESHOLD:"psu{}_temp_max", + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_api_handler_gearbox = { THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}", @@ -281,6 +288,7 @@ } ] + def initialize_thermals(sku, thermal_list, psu_list): # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] @@ -310,6 +318,8 @@ def 
initialize_thermals(sku, thermal_list, psu_list): thermal = Thermal(category, start + index, True) thermal_list.append(thermal) + + class Thermal(ThermalBase): def __init__(self, category, index, has_index, dependency = None, hint = None): """ @@ -328,9 +338,11 @@ def __init__(self, category, index, has_index, dependency = None, hint = None): self.category = category self.temperature = self._get_file_from_api(THERMAL_API_GET_TEMPERATURE) self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) + self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD) self.dependency = dependency self.dependent_hint = hint + def get_name(self): """ Retrieves the name of the device @@ -340,6 +352,7 @@ def get_name(self): """ return self.name + def _read_generic_file(self, filename, len): """ Read a generic file, returns the contents of the file @@ -352,6 +365,7 @@ def _read_generic_file(self, filename, len): logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result + def _get_file_from_api(self, api_name): if self.category == THERMAL_DEV_CATEGORY_AMBIENT: if api_name == THERMAL_API_GET_TEMPERATURE: @@ -363,9 +377,13 @@ def _get_file_from_api(self, api_name): if self.category in thermal_device_categories_singleton: filename = handler else: - filename = handler.format(self.index) + if handler: + filename = handler.format(self.index) + else: + return None return join(HW_MGMT_THERMAL_ROOT, filename) + def get_temperature(self): """ Retrieves current temperature reading from thermal @@ -385,8 +403,11 @@ def get_temperature(self): if value_str is None: return None value_float = float(value_str) + if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: + return None return value_float / 1000.0 + def get_high_threshold(self): """ Retrieves the high threshold temperature of thermal @@ -401,4 +422,25 @@ def get_high_threshold(self): if value_str is None: 
return None value_float = float(value_str) + if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: + return None + return value_float / 1000.0 + + + def get_high_critical_threshold(self): + """ + Retrieves the high critical threshold temperature of thermal + + Returns: + A float number, the high critical threshold temperature of thermal in Celsius + up to nearest thousandth of one degree Celsius, e.g. 30.125 + """ + if self.high_critical_threshold is None: + return None + value_str = self._read_generic_file(self.high_critical_threshold, 0) + if value_str is None: + return None + value_float = float(value_str) + if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: + return None return value_float / 1000.0 diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py new file mode 100644 index 000000000000..72729287d1c5 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -0,0 +1,108 @@ +from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase +from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object + + +class SetFanSpeedAction(ThermalPolicyActionBase): + """ + Base thermal action class to set speed for fans + """ + # JSON field definition + JSON_FIELD_SPEED = 'speed' + + def __init__(self): + """ + Constructor of SetFanSpeedAction which actually do nothing. + """ + self.speed = None + + def load_from_json(self, json_obj): + """ + Construct SetFanSpeedAction via JSON. JSON example: + { + "type": "fan.all.set_speed" + "speed": "100" + } + :param json_obj: A JSON object representing a SetFanSpeedAction action. 
+ :return: + """ + if SetFanSpeedAction.JSON_FIELD_SPEED in json_obj: + speed = float(json_obj[SetFanSpeedAction.JSON_FIELD_SPEED]) + if speed < 0 or speed > 100: + raise ValueError('SetFanSpeedAction invalid speed value {} in JSON policy file, valid value should be [0, 100]'. + format(speed)) + self.speed = float(json_obj[SetFanSpeedAction.JSON_FIELD_SPEED]) + else: + raise ValueError('SetFanSpeedAction missing mandatory field {} in JSON policy file'. + format(SetFanSpeedAction.JSON_FIELD_SPEED)) + + +@thermal_json_object('fan.all.set_speed') +class SetAllFanSpeedAction(SetFanSpeedAction): + """ + Action to set speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal_infos import FanInfo + if FanInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[FanInfo.INFO_NAME], FanInfo): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + for fan in fan_info_obj.get_presence_fans(): + fan.set_speed(self.speed) + + +@thermal_json_object('thermal_control.control') +class ControlThermalAlgoAction(ThermalPolicyActionBase): + """ + Action to control the thermal control algorithm + """ + # JSON field definition + JSON_FIELD_STATUS = 'status' + + def __init__(self): + self.status = True + + def load_from_json(self, json_obj): + """ + Construct ControlThermalAlgoAction via JSON. JSON example: + { + "type": "thermal_control.control" + "status": "true" + } + :param json_obj: A JSON object representing a ControlThermalAlgoAction action. + :return: + """ + if ControlThermalAlgoAction.JSON_FIELD_STATUS in json_obj: + status_str = json_obj[ControlThermalAlgoAction.JSON_FIELD_STATUS].lower() + if status_str == 'true': + self.status = True + elif status_str == 'false': + self.status = False + else: + raise ValueError('Invalid {} field value, please specify true of false'. 
+ format(ControlThermalAlgoAction.JSON_FIELD_STATUS)) + else: + raise ValueError('ControlThermalAlgoAction ' + 'missing mandatory field {} in JSON policy file'. + format(ControlThermalAlgoAction.JSON_FIELD_STATUS)) + + def execute(self, thermal_info_dict): + """ + Disable thermal control algorithm + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict: + chassis_info_obj = thermal_info_dict[ChassisInfo.INFO_NAME] + chassis = chassis_info_obj.get_chassis() + thermal_manager = chassis.get_thermal_manager() + if self.status: + thermal_manager.start_thermal_control_algorithm() + else: + thermal_manager.stop_thermal_control_algorithm() + + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py new file mode 100644 index 000000000000..2df59acc9bf1 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -0,0 +1,63 @@ +from sonic_platform_base.sonic_thermal_control.thermal_condition_base import ThermalPolicyConditionBase +from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object + + +class FanCondition(ThermalPolicyConditionBase): + def get_fan_info(self, thermal_info_dict): + from .thermal_infos import FanInfo + if FanInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[FanInfo.INFO_NAME], FanInfo): + return thermal_info_dict[FanInfo.INFO_NAME] + else: + return None + + +@thermal_json_object('fan.any.absence') +class AnyFanAbsenceCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_absence_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.absence') +class AllFanAbsenceCondition(FanCondition): + def is_match(self, thermal_info_dict): + 
fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_presence_fans()) == 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.presence') +class AllFanPresenceCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_absence_fans()) == 0 if fan_info_obj else False + + +class PsuCondition(ThermalPolicyConditionBase): + def get_psu_info(self, thermal_info_dict): + from .thermal_infos import PsuInfo + if PsuInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[PsuInfo.INFO_NAME], PsuInfo): + return thermal_info_dict[PsuInfo.INFO_NAME] + else: + return None + + +@thermal_json_object('psu.any.absence') +class AnyPsuAbsenceCondition(PsuCondition): + def is_match(self, thermal_info_dict): + psu_info_obj = self.get_psu_info(thermal_info_dict) + return len(psu_info_obj.get_absence_psus()) > 0 if psu_info_obj else False + + +@thermal_json_object('psu.all.absence') +class AllPsuAbsenceCondition(PsuCondition): + def is_match(self, thermal_info_dict): + psu_info_obj = self.get_psu_info(thermal_info_dict) + return len(psu_info_obj.get_presence_psus()) == 0 if psu_info_obj else False + + +@thermal_json_object('psu.all.presence') +class AllPsuPresenceCondition(PsuCondition): + def is_match(self, thermal_info_dict): + psu_info_obj = self.get_psu_info(thermal_info_dict) + return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py new file mode 100644 index 000000000000..34d31e47d24c --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -0,0 +1,136 @@ +from sonic_platform_base.sonic_thermal_control.thermal_info_base import ThermalPolicyInfoBase +from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object 
+ + +@thermal_json_object('fan_info') +class FanInfo(ThermalPolicyInfoBase): + """ + Fan information needed by thermal policy + """ + + # Fan information name + INFO_NAME = 'fan_info' + + def __init__(self): + self._absence_fans = set() + self._presence_fans = set() + self._status_changed = False + + def collect(self, chassis): + """ + Collect absence and presence fans. + :param chassis: The chassis object + :return: + """ + self._status_changed = False + for fan in chassis.get_all_fans(): + if fan.get_presence() and fan not in self._presence_fans: + self._presence_fans.add(fan) + self._status_changed = True + if fan in self._absence_fans: + self._absence_fans.remove(fan) + elif not fan.get_presence() and fan not in self._absence_fans: + self._absence_fans.add(fan) + self._status_changed = True + if fan in self._presence_fans: + self._presence_fans.remove(fan) + + def get_absence_fans(self): + """ + Retrieves absence fans + :return: A set of absence fans + """ + return self._absence_fans + + def get_presence_fans(self): + """ + Retrieves presence fans + :return: A set of presence fans + """ + return self._presence_fans + + def is_status_changed(self): + """ + Retrieves if the status of fan information changed + :return: True if status changed else False + """ + return self._status_changed + + +@thermal_json_object('psu_info') +class PsuInfo(ThermalPolicyInfoBase): + """ + PSU information needed by thermal policy + """ + INFO_NAME = 'psu_info' + + def __init__(self): + self._absence_psus = set() + self._presence_psus = set() + self._status_changed = False + + def collect(self, chassis): + """ + Collect absence and presence PSUs. 
+ :param chassis: The chassis object + :return: + """ + self._status_changed = False + for psu in chassis.get_all_psus(): + if psu.get_presence() and psu not in self._presence_psus: + self._presence_psus.add(psu) + self._status_changed = True + if psu in self._absence_psus: + self._absence_psus.remove(psu) + elif not psu.get_presence() and psu not in self._absence_psus: + self._absence_psus.add(psu) + self._status_changed = True + if psu in self._presence_psus: + self._presence_psus.remove(psu) + + def get_absence_psus(self): + """ + Retrieves presence PSUs + :return: A set of absence PSUs + """ + return self._absence_psus + + def get_presence_psus(self): + """ + Retrieves presence PSUs + :return: A set of presence fans + """ + return self._presence_psus + + def is_status_changed(self): + """ + Retrieves if the status of PSU information changed + :return: True if status changed else False + """ + return self._status_changed + + +@thermal_json_object('chassis_info') +class ChassisInfo(ThermalPolicyInfoBase): + """ + Chassis information needed by thermal policy + """ + INFO_NAME = 'chassis_info' + + def __init__(self): + self._chassis = None + + def collect(self, chassis): + """ + Collect platform chassis. + :param chassis: The chassis object + :return: + """ + self._chassis = chassis + + def get_chassis(self): + """ + Retrieves platform chassis object + :return: A platform chassis object. 
+ """ + return self._chassis diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py new file mode 100644 index 000000000000..133bb078ca20 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -0,0 +1,50 @@ +import os +from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase +from .thermal_actions import * +from .thermal_conditions import * +from .thermal_infos import * + + +class ThermalManager(ThermalManagerBase): + THERMAL_ALGORITHM_CONTROL_PATH = '/var/run/hw-management/config/suspend' + + @classmethod + def start_thermal_control_algorithm(cls): + """ + Start thermal control algorithm + + Returns: + bool: True if set success, False if fail. + """ + cls._control_thermal_control_algorithm(False) + + @classmethod + def stop_thermal_control_algorithm(cls): + """ + Stop thermal control algorithm + + Returns: + bool: True if set success, False if fail. + """ + cls._control_thermal_control_algorithm(True) + + @classmethod + def _control_thermal_control_algorithm(cls, suspend): + """ + Control thermal control algorithm + + Args: + suspend: Bool, indicate suspend the algorithm or not + + Returns: + bool: True if set success, False if fail. 
+ """ + status = True + write_value = 1 if suspend else 0 + try: + with open(cls.THERMAL_ALGORITHM_CONTROL_PATH, 'w') as control_file: + control_file.write(str(write_value)) + except (ValueError, IOError): + status = False + + return status diff --git a/platform/mellanox/mlnx-platform-api/tests/__init__.py b/platform/mellanox/mlnx-platform-api/tests/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py new file mode 100644 index 000000000000..b8d070d44955 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -0,0 +1,44 @@ +class MockFan: + def __init__(self): + self.presence = True + self.speed = 60 + + def get_presence(self): + return self.presence + + def set_speed(self, speed): + self.speed = speed + + +class MockPsu: + def __init__(self): + self.presence = True + + def get_presence(self): + return self.presence + + +class MockChassis: + def __init__(self): + self.fan_list = [] + self.psu_list = [] + + def get_all_psus(self): + return self.psu_list + + def get_all_fans(self): + return self.fan_list + + def get_thermal_manager(self): + from sonic_platform.thermal_manager import ThermalManager + return ThermalManager + + def make_fan_absence(self): + fan = MockFan() + fan.presence = False + self.fan_list.append(fan) + + def make_psu_absence(self): + psu = MockPsu() + psu.presence = False + self.psu_list.append(psu) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py new file mode 100644 index 000000000000..ba9e502d4f74 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -0,0 +1,272 @@ +import os +import sys +import pytest +import json +from mock import MagicMock +from .mock_platform import MockChassis, MockFan, MockPsu + +test_path = 
os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + +from sonic_platform.thermal_manager import ThermalManager +from sonic_platform.thermal_infos import FanInfo, PsuInfo + + +@pytest.fixture(scope='session', autouse=True) +def thermal_manager(): + policy_file = os.path.join(test_path, 'thermal_policy.json') + ThermalManager.load(policy_file) + return ThermalManager + + +def test_load_policy(thermal_manager): + assert 'psu_info' in thermal_manager._thermal_info_dict + assert 'fan_info' in thermal_manager._thermal_info_dict + assert 'chassis_info' in thermal_manager._thermal_info_dict + + assert 'any fan absence' in thermal_manager._policy_dict + assert 'any psu absence' in thermal_manager._policy_dict + assert 'all fan and psu presence' in thermal_manager._policy_dict + + assert thermal_manager._fan_speed_when_suspend == 60 + assert thermal_manager._run_thermal_algorithm_at_boot_up == False + + +def test_fan_info(): + chassis = MockChassis() + chassis.make_fan_absence() + fan_info = FanInfo() + fan_info.collect(chassis) + assert len(fan_info.get_absence_fans()) == 1 + assert len(fan_info.get_presence_fans()) == 0 + assert fan_info.is_status_changed() + + fan_list = chassis.get_all_fans() + fan_list[0].presence = True + fan_info.collect(chassis) + assert len(fan_info.get_absence_fans()) == 0 + assert len(fan_info.get_presence_fans()) == 1 + assert fan_info.is_status_changed() + + +def test_psu_info(): + chassis = MockChassis() + chassis.make_psu_absence() + psu_info = PsuInfo() + psu_info.collect(chassis) + assert len(psu_info.get_absence_psus()) == 1 + assert len(psu_info.get_presence_psus()) == 0 + assert psu_info.is_status_changed() + + psu_list = chassis.get_all_psus() + psu_list[0].presence = True + psu_info.collect(chassis) + assert len(psu_info.get_absence_psus()) == 0 + assert len(psu_info.get_presence_psus()) == 1 + assert psu_info.is_status_changed() + + +def 
test_fan_policy(thermal_manager): + chassis = MockChassis() + chassis.make_fan_absence() + chassis.fan_list.append(MockFan()) + thermal_manager.start_thermal_control_algorithm = MagicMock() + thermal_manager.stop_thermal_control_algorithm = MagicMock() + thermal_manager.run_policy(chassis) + + fan_list = chassis.get_all_fans() + assert fan_list[1].speed == 100 + thermal_manager.stop_thermal_control_algorithm.assert_called_once() + + fan_list[0].presence = True + thermal_manager.run_policy(chassis) + thermal_manager.start_thermal_control_algorithm.assert_called_once() + + +def test_psu_policy(thermal_manager): + chassis = MockChassis() + chassis.make_psu_absence() + chassis.fan_list.append(MockFan()) + thermal_manager.start_thermal_control_algorithm = MagicMock() + thermal_manager.stop_thermal_control_algorithm = MagicMock() + thermal_manager.run_policy(chassis) + + fan_list = chassis.get_all_fans() + assert fan_list[0].speed == 100 + thermal_manager.stop_thermal_control_algorithm.assert_called_once() + + psu_list = chassis.get_all_psus() + psu_list[0].presence = True + thermal_manager.run_policy(chassis) + thermal_manager.start_thermal_control_algorithm.assert_called_once() + + +def test_any_fan_absence_condition(): + chassis = MockChassis() + chassis.make_fan_absence() + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AnyFanAbsenceCondition + condition = AnyFanAbsenceCondition() + assert condition.is_match({'fan_info': fan_info}) + + fan = chassis.get_all_fans()[0] + fan.presence = True + fan_info.collect(chassis) + assert not condition.is_match({'fan_info': fan_info}) + + +def test_all_fan_absence_condition(): + chassis = MockChassis() + chassis.make_fan_absence() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllFanAbsenceCondition + condition = AllFanAbsenceCondition() + assert not 
condition.is_match({'fan_info': fan_info}) + + fan.presence = False + fan_info.collect(chassis) + assert condition.is_match({'fan_info': fan_info}) + + +def test_all_fan_presence_condition(): + chassis = MockChassis() + chassis.make_fan_absence() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllFanPresenceCondition + condition = AllFanPresenceCondition() + assert not condition.is_match({'fan_info': fan_info}) + + fan_list[0].presence = True + fan_info.collect(chassis) + assert condition.is_match({'fan_info': fan_info}) + + +def test_any_psu_absence_condition(): + chassis = MockChassis() + chassis.make_psu_absence() + psu_info = PsuInfo() + psu_info.collect(chassis) + + from sonic_platform.thermal_conditions import AnyPsuAbsenceCondition + condition = AnyPsuAbsenceCondition() + assert condition.is_match({'psu_info': psu_info}) + + psu = chassis.get_all_psus()[0] + psu.presence = True + psu_info.collect(chassis) + assert not condition.is_match({'psu_info': psu_info}) + + +def test_all_psu_absence_condition(): + chassis = MockChassis() + chassis.make_psu_absence() + psu = MockPsu() + psu_list = chassis.get_all_psus() + psu_list.append(psu) + psu_info = PsuInfo() + psu_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllPsuAbsenceCondition + condition = AllPsuAbsenceCondition() + assert not condition.is_match({'psu_info': psu_info}) + + psu.presence = False + psu_info.collect(chassis) + assert condition.is_match({'psu_info': psu_info}) + + +def test_all_fan_presence_condition(): + chassis = MockChassis() + chassis.make_psu_absence() + psu = MockPsu() + psu_list = chassis.get_all_psus() + psu_list.append(psu) + psu_info = PsuInfo() + psu_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllPsuPresenceCondition + condition = AllPsuPresenceCondition() + assert not condition.is_match({'psu_info': 
psu_info}) + + psu_list[0].presence = True + psu_info.collect(chassis) + assert condition.is_match({'psu_info': psu_info}) + + +def test_load_set_fan_speed_action(): + from sonic_platform.thermal_actions import SetAllFanSpeedAction + action = SetAllFanSpeedAction() + json_str = '{\"speed\": \"50\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert action.speed == 50 + + json_str = '{\"speed\": \"-1\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"speed\": \"101\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"invalid\": \"101\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + +def test_execute_set_fan_speed_action(): + chassis = MockChassis() + fan_list = chassis.get_all_fans() + fan_list.append(MockFan()) + fan_list.append(MockFan()) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_actions import SetAllFanSpeedAction + action = SetAllFanSpeedAction() + action.speed = 99 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 99 + assert fan_list[1].speed == 99 + + +def test_load_control_thermal_algo_action(): + from sonic_platform.thermal_actions import ControlThermalAlgoAction + action = ControlThermalAlgoAction() + json_str = '{\"status\": \"false\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert not action.status + + json_str = '{\"status\": \"true\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert action.status + + json_str = '{\"status\": \"invalid\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"invalid\": \"true\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + diff --git 
a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json new file mode 100644 index 000000000000..5d31b2abd875 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -0,0 +1,72 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "false", + "fan_speed_when_suspend": "60" + }, + "info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "true" + } + ] + } + ] +} \ No newline at end of file diff --git a/rules/docker-platform-monitor.mk b/rules/docker-platform-monitor.mk index a37f4d2e9ee7..db1c8c5a0289 100644 --- a/rules/docker-platform-monitor.mk +++ b/rules/docker-platform-monitor.mk @@ -10,7 +10,7 @@ $(DOCKER_PLATFORM_MONITOR)_DEPENDS += $(LIBSENSORS) $(LM_SENSORS) $(FANCONTROL) ifeq ($(CONFIGURED_PLATFORM),barefoot) $(DOCKER_PLATFORM_MONITOR)_DEPENDS += $(PYTHON_THRIFT) endif -$(DOCKER_PLATFORM_MONITOR)_PYTHON_DEBS += $(SONIC_LEDD) $(SONIC_XCVRD) $(SONIC_PSUD) $(SONIC_SYSEEPROMD) +$(DOCKER_PLATFORM_MONITOR)_PYTHON_DEBS += $(SONIC_LEDD) $(SONIC_XCVRD) $(SONIC_PSUD) $(SONIC_SYSEEPROMD) $(SONIC_THERMALCTLD) $(DOCKER_PLATFORM_MONITOR)_PYTHON_WHEELS += $(SONIC_PLATFORM_COMMON_PY2) $(DOCKER_PLATFORM_MONITOR)_PYTHON_WHEELS 
+= $(SWSSSDK_PY2) $(DOCKER_PLATFORM_MONITOR)_PYTHON_WHEELS += $(SONIC_PLATFORM_API_PY2) diff --git a/rules/sonic-thermalctld.mk b/rules/sonic-thermalctld.mk new file mode 100644 index 000000000000..775082e7bbce --- /dev/null +++ b/rules/sonic-thermalctld.mk @@ -0,0 +1,6 @@ +# sonic-thermalctld (SONiC Thermal control daemon) Debian package + +SONIC_THERMALCTLD = python-sonic-thermalctld_1.0-1_all.deb +$(SONIC_THERMALCTLD)_SRC_PATH = $(SRC_PATH)/sonic-platform-daemons/sonic-thermalctld +$(SONIC_THERMALCTLD)_WHEEL_DEPENDS = $(SONIC_DAEMON_BASE_PY2) +SONIC_PYTHON_STDEB_DEBS += $(SONIC_THERMALCTLD) diff --git a/src/sonic-daemon-base/sonic_daemon_base/task_base.py b/src/sonic-daemon-base/sonic_daemon_base/task_base.py new file mode 100644 index 000000000000..e1738ffba213 --- /dev/null +++ b/src/sonic-daemon-base/sonic_daemon_base/task_base.py @@ -0,0 +1,50 @@ +import multiprocessing +import os +import signal +import threading + + +# +# ProcessTaskBase ===================================================================== +# +class ProcessTaskBase(object): # TODO: put this class to swss-platform-common + def __init__(self): + self.task_process = None + self.task_stopping_event = multiprocessing.Event() + + def task_worker(self): + pass + + def task_run(self): + if self.task_stopping_event.is_set(): + return + + self.task_process = multiprocessing.Process(target=self.task_worker) + self.task_process.start() + + def task_stop(self): + self.task_stopping_event.set() + os.kill(self.task_process.pid, signal.SIGKILL) + + +# +# ThreadTaskBase ===================================================================== +# +class ThreadTaskBase(object): # TODO: put this class to swss-platform-common; + def __init__(self): + self.task_thread = None + self.task_stopping_event = threading.Event() + + def task_worker(self): + pass + + def task_run(self): + if self.task_stopping_event.is_set(): + return + + self.task_thread = threading.Thread(target=self.task_worker) + self.task_thread.start() 
+ + def task_stop(self): + self.task_stopping_event.set() + self.task_thread.join() diff --git a/src/sonic-platform-common b/src/sonic-platform-common index ee60f546d874..df964ac98dc4 160000 --- a/src/sonic-platform-common +++ b/src/sonic-platform-common @@ -1 +1 @@ -Subproject commit ee60f546d8740418ec2bd2ca922cc3be5fdfd0ac +Subproject commit df964ac98dc46c0096ef19a683ff58637c4e2b05 From 9aa700b7674e7a4046f260f9466f60b522d1a666 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Thu, 26 Mar 2020 01:54:07 +0800 Subject: [PATCH 02/14] [Mellanox] Fix thermal control bugs (#4298) * [thermal control] Fix pmon docker stop issue on 3800 * [thermal fix] Fix QA test issue * [thermal fix] change psu._get_power_available_status to psu.get_power_available_status * [thermal fix] adjust log for PSU absence and power absence * [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies * [thermal] fix fan.get_presence for non-removable SKU * [thermal fix] fix issue: fan direction is based on drawer * Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A * [thermal fix] add unit test for get_direction for absent FAN * Unplugable PSU has no FAN, no need add a FAN object for this PSU * Update submodules Co-authored-by: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Conflicts: platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py platform/mellanox/mlnx-platform-api/sonic_platform/fan.py platform/mellanox/mlnx-platform-api/sonic_platform/psu.py platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py src/sonic-platform-common src/sonic-platform-daemons --- .../sonic_platform/chassis.py | 4 +- .../mlnx-platform-api/sonic_platform/fan.py | 74 ++++++++-- .../mlnx-platform-api/sonic_platform/psu.py | 126 +++++++++++++++++- .../sonic_platform/thermal.py | 32 +++-- .../sonic_platform/thermal_infos.py | 4 +- 
.../tests/duplicate_action.json | 18 +++ .../tests/duplicate_condition.json | 17 +++ .../mlnx-platform-api/tests/empty_action.json | 10 ++ .../tests/empty_condition.json | 11 ++ .../mlnx-platform-api/tests/mock_platform.py | 4 + .../tests/policy_with_same_conditions.json | 75 +++++++++++ .../mlnx-platform-api/tests/test_fan_api.py | 17 +++ .../tests/test_thermal_policy.py | 46 +++++++ 13 files changed, 409 insertions(+), 29 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/tests/duplicate_action.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/empty_action.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/empty_condition.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_fan_api.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 78f8dbc3c48e..f55f066dfdc7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -95,9 +95,9 @@ def initialize_fan(self): for index in range(num_of_fan): if multi_rotor_in_drawer: - fan = Fan(index, index/2) + fan = Fan(has_fan_dir, index, index/2, False, self.sku_name) else: - fan = Fan(index, index) + fan = Fan(has_fan_dir, index, index, False, self.sku_name) self._fan_list.append(fan) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 2ec59b8e72fa..c25e2288cda5 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -23,15 +23,23 @@ FAN_PATH = "/var/run/hw-management/thermal/" LED_PATH = "/var/run/hw-management/led/" +# SKUs with unplugable 
FANs: +# 1. don't have fanX_status and should be treated as always present +hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] + class Fan(FanBase): """Platform-specific Fan class""" - def __init__(self, fan_index, drawer_index = 1, psu_fan = False): + + STATUS_LED_COLOR_ORANGE = "orange" + + def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 self.index = fan_index + 1 self.drawer_index = drawer_index + 1 self.is_psu_fan = psu_fan - + self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True + self.fan_min_speed_path = "fan{}_min".format(self.index) if not self.is_psu_fan: self.fan_speed_get_path = "fan{}_speed_get".format(self.index) @@ -42,7 +50,7 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False): else: self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index) self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) - self._name = 'psu_{}_fan_{}'.format(self.index, fan_index) + self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) @@ -50,6 +58,45 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False): self.fan_orange_led_path = "led_fan{}_orange".format(self.drawer_index) self.fan_pwm_path = "pwm1" self.fan_led_cap_path = "led_fan{}_capability".format(self.drawer_index) + if has_fan_dir: + self.fan_dir = FAN_DIR + else: + self.fan_dir = None + + + def get_direction(self): + """ + Retrieves the fan's direction + + Returns: + A string, either FAN_DIRECTION_INTAKE or FAN_DIRECTION_EXHAUST + depending on fan direction + + Notes: + What Mellanox calls forward: + Air flows from fans side to QSFP side, for example: MSN2700-CS2F + which means intake in community + What Mellanox calls reverse: + Air flow from QSFP side 
to fans side, for example: MSN2700-CS2R + which means exhaust in community + According to hw-mgmt: + 1 stands for forward, in other words intake + 0 stands for reverse, in other words exhaust + """ + if not self.fan_dir or self.is_psu_fan or not self.get_presence(): + return self.FAN_DIRECTION_NOT_APPLICABLE + + try: + with open(os.path.join(self.fan_dir), 'r') as fan_dir: + fan_dir_bits = int(fan_dir.read()) + fan_mask = 1 << self.drawer_index - 1 + if fan_dir_bits & fan_mask: + return self.FAN_DIRECTION_INTAKE + else: + return self.FAN_DIRECTION_EXHAUST + except (ValueError, IOError) as e: + raise RuntimeError("Failed to read fan direction status to {}".format(repr(e))) + def get_name(self): return self._name @@ -63,15 +110,15 @@ def get_status(self): """ status = 0 if self.is_psu_fan: - status = 1 + status = 0 else: try: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: status = int(fault_status.read()) except (ValueError, IOError): - status = 0 + status = 1 - return status == 1 + return status == 0 def get_presence(self): """ @@ -87,11 +134,14 @@ def get_presence(self): else: status = 0 else: - try: - with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: - status = int(presence_status.read()) - except (ValueError, IOError): - status = 0 + if self.always_presence: + status = 1 + else: + try: + with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: + status = int(presence_status.read()) + except (ValueError, IOError): + status = 0 return status == 1 @@ -135,6 +185,8 @@ def get_speed(self): max_speed_in_rpm = self._get_max_speed_in_rpm() speed = 100*speed_in_rpm/max_speed_in_rpm + if speed > 100: + speed = 100 return speed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 22091474e437..b627a79ee888 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ 
b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -91,13 +91,21 @@ def __init__(self, psu_index, sku): psu_presence = os.path.join(self.psu_path, psu_presence) self.psu_presence = psu_presence - fan = Fan(psu_index, psu_index, True) - if fan.get_presence(): + # unplugable PSU has no FAN + if sku not in hwsku_dict_with_unplugable_psu: + fan = Fan(sku, psu_index, psu_index, True) self._fan_list.append(fan) + self.psu_green_led_path = "led_psu_green" + self.psu_red_led_path = "led_psu_red" + self.psu_orange_led_path = "led_psu_orange" + self.psu_led_cap_path = "led_psu_capability" + + def get_name(self): return self._name + def _read_generic_file(self, filename, len): """ Read a generic file, returns the contents of the file @@ -173,3 +181,117 @@ def get_power(self): return float(power) / 1000000 else: return None + + + def _get_led_capability(self): + cap_list = None + try: + with open(os.path.join(LED_PATH, self.psu_led_cap_path), 'r') as psu_led_cap: + caps = psu_led_cap.read() + cap_list = caps.split() + except (ValueError, IOError): + status = 0 + + return cap_list + + + def set_status_led(self, color): + """ + Sets the state of the PSU status LED + + Args: + color: A string representing the color with which to set the + PSU status LED + + Returns: + bool: True if status LED state is set successfully, False if not + + Notes: + Only one led for all PSUs. 
+ """ + led_cap_list = self._get_led_capability() + if led_cap_list is None: + return False + + status = False + try: + if color == self.STATUS_LED_COLOR_GREEN: + with open(os.path.join(LED_PATH, self.psu_green_led_path), 'w') as psu_led: + psu_led.write(LED_ON) + status = True + elif color == self.STATUS_LED_COLOR_RED: + # Some PSUs don't support a red LED but do support an orange one; in that case we set the LED to orange + if self.STATUS_LED_COLOR_RED in led_cap_list: + led_path = os.path.join(LED_PATH, self.psu_red_led_path) + elif self.STATUS_LED_COLOR_ORANGE in led_cap_list: + led_path = os.path.join(LED_PATH, self.psu_orange_led_path) + else: + return False + with open(led_path, 'w') as psu_led: + psu_led.write(LED_ON) + status = True + elif color == self.STATUS_LED_COLOR_OFF: + if self.STATUS_LED_COLOR_GREEN in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_green_led_path), 'w') as psu_led: + psu_led.write(str(LED_OFF)) + if self.STATUS_LED_COLOR_RED in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_red_led_path), 'w') as psu_led: + psu_led.write(str(LED_OFF)) + if self.STATUS_LED_COLOR_ORANGE in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_orange_led_path), 'w') as psu_led: + psu_led.write(str(LED_OFF)) + + status = True + else: + status = False + except (ValueError, IOError): + status = False + + return status + + + def get_status_led(self): + """ + Gets the state of the PSU status LED + + Returns: + A string, one of the predefined STATUS_LED_COLOR_* strings above + """ + led_cap_list = self._get_led_capability() + if led_cap_list is None: + return self.STATUS_LED_COLOR_OFF + + try: + with open(os.path.join(LED_PATH, self.psu_green_led_path), 'r') as psu_led: + if LED_OFF != psu_led.read().rstrip('\n'): + return self.STATUS_LED_COLOR_GREEN + if self.STATUS_LED_COLOR_RED in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_red_led_path), 'r') as psu_led: + if LED_OFF != psu_led.read().rstrip('\n'): + return 
self.STATUS_LED_COLOR_RED + if self.STATUS_LED_COLOR_ORANGE in led_cap_list: + with open(os.path.join(LED_PATH, self.psu_orange_led_path), 'r') as psu_led: + if LED_OFF != psu_led.read().rstrip('\n'): + return self.STATUS_LED_COLOR_RED + except (ValueError, IOError) as e: + raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e))) + + return self.STATUS_LED_COLOR_OFF + + + def get_power_available_status(self): + """ + Gets the power available status + + Returns: + True if power is present and power on. + False and "absence of PSU" if power is not present. + False and "absence of power" if power is present but not power on. + """ + if not self.get_presence(): + return False, "absence of PSU" + elif not self.get_powergood_status(): + return False, "absence of power" + else: + return True, "" + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index f445c3b25058..581ebc1255aa 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -63,8 +63,9 @@ THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_api_handler_gearbox = { - THERMAL_API_GET_TEMPERATURE:"temp_input_gearbox{}", - THERMAL_API_GET_HIGH_THRESHOLD:None + THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input", + THERMAL_API_GET_HIGH_THRESHOLD:None, + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_ambient_apis = { THERMAL_DEV_ASIC_AMBIENT : "asic", @@ -311,7 +312,7 @@ def initialize_thermals(sku, thermal_list, psu_list): else: if category == THERMAL_DEV_CATEGORY_PSU: for index in range(count): - thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off") + thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status) thermal_list.append(thermal) else: for index in range(count): @@ -321,7 +322,7 @@ def initialize_thermals(sku, 
thermal_list, psu_list): class Thermal(ThermalBase): - def __init__(self, category, index, has_index, dependency = None, hint = None): + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories """ @@ -340,7 +341,6 @@ def __init__(self, category, index, has_index, dependency = None, hint = None): self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD) self.dependency = dependency - self.dependent_hint = hint def get_name(self): @@ -392,13 +392,11 @@ def get_temperature(self): A float number of current temperature in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - if self.dependency and not self.dependency(): - if self.dependent_hint: - hint = self.dependent_hint - else: - hint = "unknown reason" - logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint)) - return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.temperature, 0) if value_str is None: return None @@ -418,6 +416,11 @@ def get_high_threshold(self): """ if self.high_threshold is None: return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_threshold, 0) if value_str is None: return None @@ -437,6 +440,11 @@ def get_high_critical_threshold(self): """ if self.high_critical_threshold is None: return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = 
self._read_generic_file(self.high_critical_threshold, 0) if value_str is None: return None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 34d31e47d24c..82c186495f5e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -77,12 +77,12 @@ def collect(self, chassis): """ self._status_changed = False for psu in chassis.get_all_psus(): - if psu.get_presence() and psu not in self._presence_psus: + if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus: self._presence_psus.add(psu) self._status_changed = True if psu in self._absence_psus: self._absence_psus.remove(psu) - elif not psu.get_presence() and psu not in self._absence_psus: + elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus: self._absence_psus.add(psu) self._status_changed = True if psu in self._presence_psus: diff --git a/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json b/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json new file mode 100644 index 000000000000..c19787aa26e0 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json @@ -0,0 +1,18 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} diff --git a/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json b/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json new file mode 100644 index 000000000000..c25d84762e2a --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json @@ -0,0 +1,17 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + }, + { + "type": 
"fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} diff --git a/platform/mellanox/mlnx-platform-api/tests/empty_action.json b/platform/mellanox/mlnx-platform-api/tests/empty_action.json new file mode 100644 index 000000000000..b1051b5a6f60 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/empty_action.json @@ -0,0 +1,10 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/empty_condition.json b/platform/mellanox/mlnx-platform-api/tests/empty_condition.json new file mode 100644 index 000000000000..e7a588459246 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/empty_condition.json @@ -0,0 +1,11 @@ +{ + "name": "any fan absence", + "conditions": [ + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index b8d070d44955..f34ace97968d 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -13,10 +13,14 @@ def set_speed(self, speed): class MockPsu: def __init__(self): self.presence = True + self.powergood = True def get_presence(self): return self.presence + def get_powergood_status(self): + return self.powergood + class MockChassis: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json new file mode 100644 index 000000000000..ace291be1c55 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json @@ -0,0 +1,75 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "false", + "fan_speed_when_suspend": "60" + }, + 
"info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence 1", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "true" + } + ] + } + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py b/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py new file mode 100644 index 000000000000..381260163c0f --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py @@ -0,0 +1,17 @@ +import os +import sys +from mock import MagicMock + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + +from sonic_platform.fan import Fan + + +def test_get_absence_fan_direction(): + fan = Fan(True, 0, 0) + fan.get_presence = MagicMock(return_value=False) + assert fan.fan_dir is not None + assert not fan.is_psu_fan + assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index ba9e502d4f74..843244e937fa 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ 
-66,6 +66,12 @@ def test_psu_info(): assert len(psu_info.get_presence_psus()) == 1 assert psu_info.is_status_changed() + psu_list[0].powergood = False + psu_info.collect(chassis) + assert len(psu_info.get_absence_psus()) == 1 + assert len(psu_info.get_presence_psus()) == 0 + assert psu_info.is_status_changed() + def test_fan_policy(thermal_manager): chassis = MockChassis() @@ -269,4 +275,44 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_duplicate_condition(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'duplicate_condition.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_duplicate_action(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'duplicate_action.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_empty_condition(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'empty_condition.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_empty_action(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'empty_action.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_policy_with_same_conditions(): + from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase + class MockThermalManager(ThermalManagerBase): + pass + + with pytest.raises(Exception): + 
MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) + From 7be4a6c453705841e1be09ab0a09b7cf168df42a Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 21 Apr 2020 23:09:53 +0800 Subject: [PATCH 03/14] [Mellanox] thermal control enhancement for dynamic minimum fan speed and PSU fan speed policy (#4403) Conflicts: platform/mellanox/mlnx-platform-api/sonic_platform/fan.py --- .../thermal_policy.json | 27 ++- dockers/docker-platform-monitor/Dockerfile.j2 | 3 +- .../sonic_platform/chassis.py | 12 +- .../sonic_platform/device_data.py | 101 ++++++++ .../mlnx-platform-api/sonic_platform/fan.py | 106 +++++++-- .../mlnx-platform-api/sonic_platform/psu.py | 4 +- .../sonic_platform/thermal.py | 148 +++++++++++- .../sonic_platform/thermal_actions.py | 114 ++++++++- .../sonic_platform/thermal_conditions.py | 68 ++++++ .../sonic_platform/thermal_infos.py | 22 +- .../sonic_platform/thermal_manager.py | 55 +++-- .../mlnx-platform-api/tests/mock_platform.py | 14 +- .../tests/test_thermal_policy.py | 217 +++++++++++++++++- .../tests/thermal_policy.json | 25 ++ 14 files changed, 851 insertions(+), 65 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json index 054d797be951..f16f68dd002e 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -1,6 +1,6 @@ { "thermal_control_algorithm": { - "run_at_boot_up": "false", + "run_at_boot_up": "true", "fan_speed_when_suspend": "60" }, "info_types": [ @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + 
} + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,15 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { - "type": "fan.all.set_speed", - "speed": "60" + "type": "thermal_control.control", + "status": "true" } ] } diff --git a/dockers/docker-platform-monitor/Dockerfile.j2 b/dockers/docker-platform-monitor/Dockerfile.j2 index 61374af7d544..2ce609fff178 100755 --- a/dockers/docker-platform-monitor/Dockerfile.j2 +++ b/dockers/docker-platform-monitor/Dockerfile.j2 @@ -18,7 +18,8 @@ RUN apt-get update && \ rrdtool \ python-smbus \ ethtool \ - dmidecode && \ + dmidecode \ + i2c-tools && \ pip install enum34 {% if docker_platform_monitor_debs.strip() -%} diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index f55f066dfdc7..1ac782555673 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -28,6 +28,7 @@ MLNX_NUM_PSU = 2 GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku" +GET_PLATFORM_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.platform" EEPROM_CACHE_ROOT = '/var/cache/sonic/decode-syseeprom' EEPROM_CACHE_FILE = 'syseeprom_cache' @@ -55,6 +56,7 @@ def __init__(self): # Initialize SKU name self.sku_name = self._get_sku_name() + self.platform_name = self._get_platform_name() mi = get_machine_info() if mi is not None: self.name = mi['onie_platform'] @@ -95,9 +97,9 @@ def initialize_fan(self): for index in range(num_of_fan): if multi_rotor_in_drawer: - fan = Fan(has_fan_dir, index, index/2, False, self.sku_name) + fan = Fan(has_fan_dir, index, index/2, False, self.platform_name) else: - fan = Fan(has_fan_dir, index, index, False, self.sku_name) + fan = Fan(has_fan_dir, index, index, False, self.platform_name) self._fan_list.append(fan) @@ -230,6 +232,12 @@ def _get_sku_name(self): return out.rstrip('\n') + 
def _get_platform_name(self): + p = subprocess.Popen(GET_PLATFORM_CMD, shell=True, stdout=subprocess.PIPE) + out, err = p.communicate() + return out.rstrip('\n') + + def _get_port_position_tuple_by_sku_name(self): position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku_name]] return position_tuple diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py new file mode 100644 index 000000000000..f006281c511f --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -0,0 +1,101 @@ +DEVICE_DATA = { + 'x86_64-mlnx_msn2700-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'x86_64-mlnx_msn2740-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":13}, + "p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15}, + "c2p_trust": {"-127:120":13}, + "c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + "unk_trust": {"-127:120":13}, + "unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + } + } + }, + 'x86_64-mlnx_msn2100-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:40":12, "41:120":13}, + "c2p_untrust": {"-127:40":12, "41:120":13}, + "unk_trust": {"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn2410-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, 
+ "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'x86_64-mlnx_msn2010-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:120":12}, + "c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16}, + "unk_trust": {"-127:120":12}, + "unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn3700-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'x86_64-mlnx_msn3700c-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'x86_64-mlnx_msn3800-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:35":12, "36:120":13}, + "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + 
"c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, + 'x86_64-mlnx_msn4700-r0': { + + } +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index c25e2288cda5..985924f30857 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -9,6 +9,7 @@ ############################################################################# import os.path +import subprocess try: from sonic_platform_base.fan_base import FanBase @@ -22,23 +23,34 @@ FAN_PATH = "/var/run/hw-management/thermal/" LED_PATH = "/var/run/hw-management/led/" +CONFIG_PATH = "/var/run/hw-management/config" +# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches +FAN_DIR = "/var/run/hw-management/system/fan_dir" +COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" -# SKUs with unplugable FANs: +# Platforms with unplugable FANs: # 1. 
don't have fanX_status and should be treated as always present -hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] +platform_with_unplugable_fan = ['x86_64-mlnx_msn2010-r0', 'x86_64-mlnx_msn2100-r0'] + class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" - - def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): + min_cooling_level = 2 + MIN_VALID_COOLING_LEVEL = 1 + MAX_VALID_COOLING_LEVEL = 10 + # PSU fan speed vector + PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c', + '0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64'] + + def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, platform = None): # API index is starting from 0, Mellanox platform index is starting from 1 self.index = fan_index + 1 self.drawer_index = drawer_index + 1 self.is_psu_fan = psu_fan - self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True + self.always_presence = False if platform not in platform_with_unplugable_fan else True self.fan_min_speed_path = "fan{}_min".format(self.index) if not self.is_psu_fan: @@ -52,6 +64,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None + self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command') + self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) @@ -88,7 +104,7 @@ def get_direction(self): try: with open(os.path.join(self.fan_dir), 'r') as fan_dir: - fan_dir_bits = int(fan_dir.read()) + fan_dir_bits = 
int(fan_dir.read().strip()) fan_mask = 1 << self.drawer_index - 1 if fan_dir_bits & fan_mask: return self.FAN_DIRECTION_INTAKE @@ -114,7 +130,7 @@ def get_status(self): else: try: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: - status = int(fault_status.read()) + status = int(fault_status.read().strip()) except (ValueError, IOError): status = 1 @@ -139,7 +155,7 @@ def get_presence(self): else: try: with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: - status = int(presence_status.read()) + status = int(presence_status.read().strip()) except (ValueError, IOError): status = 0 @@ -159,7 +175,7 @@ def _get_max_speed_in_rpm(self): speed = 0 try: with open(os.path.join(FAN_PATH, self.fan_max_speed_path), 'r') as max_fan_speed: - speed = int(max_fan_speed.read()) + speed = int(max_fan_speed.read().strip()) except (ValueError, IOError): speed = 0 @@ -175,7 +191,7 @@ def get_speed(self): speed = 0 try: with open(os.path.join(FAN_PATH, self.fan_speed_get_path), 'r') as fan_curr_speed: - speed_in_rpm = int(fan_curr_speed.read()) + speed_in_rpm = int(fan_curr_speed.read().strip()) except (ValueError, IOError): speed_in_rpm = 0 @@ -203,7 +219,7 @@ def get_target_speed(self): try: with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm: - pwm = int(fan_pwm.read()) + pwm = int(fan_pwm.read().strip()) except (ValueError, IOError): pwm = 0 @@ -223,13 +239,34 @@ def set_speed(self, speed): bool: True if set success, False if fail. """ status = True - pwm = int(round(PWM_MAX*speed/100.0)) if self.is_psu_fan: - #PSU fan speed is not setable. 
- return False - + from .thermal import logger + try: + with open(self.psu_i2c_bus_path, 'r') as f: + bus = f.read().strip() + with open(self.psu_i2c_addr_path, 'r') as f: + addr = f.read().strip() + with open(self.psu_i2c_command_path, 'r') as f: + command = f.read().strip() + speed = Fan.PSU_FAN_SPEED[int(speed / 10)] + command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed) + subprocess.check_call(command, shell = True) + return True + except subprocess.CalledProcessError as ce: + logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output)) + return False + except Exception as e: + logger.log_error('Failed to set PSU FAN speed - {}'.format(e)) + return False + try: + cooling_level = int(speed / 10) + if cooling_level < self.min_cooling_level: + cooling_level = self.min_cooling_level + speed = self.min_cooling_level * 10 + self.set_cooling_level(cooling_level, cooling_level) + pwm = int(round(PWM_MAX*speed/100.0)) with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: fan_pwm.write(str(pwm)) except (ValueError, IOError): @@ -304,3 +341,42 @@ def get_speed_tolerance(self): """ # The tolerance value is fixed as 20% for all the Mellanox platform return 20 + + @classmethod + def set_cooling_level(cls, level, cur_state): + """ + Change cooling level. The input level should be an integer value [1, 10]. + 1 means 10%, 2 means 20%, 10 means 100%. + """ + if not isinstance(level, int): + raise RuntimeError("Failed to set cooling level, input parameter must be integer") + + if level < cls.MIN_VALID_COOLING_LEVEL or level > cls.MAX_VALID_COOLING_LEVEL: + raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format( + cls.MIN_VALID_COOLING_LEVEL, + cls.MAX_VALID_COOLING_LEVEL, + level + )) + + try: + # Reset FAN cooling level vector. 
According to low level team, + # if we need set cooling level to X, we need first write a (10+X) + # to cooling_cur_state file to reset the cooling level vector. + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(level + 10)) + + # We need set cooling level after resetting the cooling level vector + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(cur_state)) + except (ValueError, IOError) as e: + raise RuntimeError("Failed to set cooling level - {}".format(e)) + + @classmethod + def get_cooling_level(cls): + try: + with open(COOLING_STATE_PATH, 'r') as cooling_state: + cooling_level = int(cooling_state.read().strip()) + return cooling_level + except (ValueError, IOError) as e: + raise RuntimeError("Failed to get cooling level - {}".format(e)) + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index b627a79ee888..4da87d95e1e5 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -93,7 +93,7 @@ def __init__(self, psu_index, sku): # unplugable PSU has no FAN if sku not in hwsku_dict_with_unplugable_psu: - fan = Fan(sku, psu_index, psu_index, True) + fan = Fan(False, psu_index, psu_index, True) self._fan_list.append(fan) self.psu_green_led_path = "led_psu_green" @@ -113,7 +113,7 @@ def _read_generic_file(self, filename, len): result = 0 try: with open(filename, 'r') as fileobj: - result = int(fileobj.read()) + result = int(fileobj.read().strip()) except Exception as e: logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 581ebc1255aa..bb0ef5fb776a 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ 
b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -42,6 +42,16 @@ HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" +THERMAL_ZONE_ASIC_PATH = "/var/run/hw-management/thermal/mlxsw/" +THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/" +THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/" +THERMAL_ZONE_MODE = "thermal_zone_mode" +THERMAL_ZONE_POLICY = "thermal_zone_policy" +THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" +THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" + +MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault" + thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max", @@ -294,6 +304,7 @@ def initialize_thermals(sku, thermal_list, psu_list): # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index] + Thermal.thermal_profile = thermal_profile for category in thermal_device_categories_all: if category == THERMAL_DEV_CATEGORY_AMBIENT: count, ambient_list = thermal_profile[category] @@ -322,6 +333,9 @@ def initialize_thermals(sku, thermal_list, psu_list): class Thermal(ThermalBase): + thermal_profile = None + thermal_algorithm_status = False + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories @@ -353,14 +367,15 @@ def get_name(self): return self.name - def _read_generic_file(self, filename, len): + @classmethod + def _read_generic_file(cls, filename, len): """ Read a generic file, returns the contents of the file """ result = None try: with open(filename, 'r') as fileobj: - result = fileobj.read() + result = fileobj.read().strip() except Exception as e: logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result @@ -452,3 +467,132 @@ def get_high_critical_threshold(self): if 
self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: return None return value_float / 1000.0 + + + @classmethod + def _write_generic_file(cls, filename, content): + """ + Generic functions to write content to a specified file path if + the content has changed. + """ + try: + with open(filename, 'w+') as file_obj: + origin_content = file_obj.read() + if origin_content != content: + file_obj.write(content) + except Exception as e: + logger.log_info("Fail to write file {} due to {}".format(filename, repr(e))) + + @classmethod + def set_thermal_algorithm_status(cls, status, force=True): + """ + Enable/disable kernel thermal algorithm. + When enable kernel thermal algorithm, kernel will adjust fan speed + according to thermal zones temperature. Please note that kernel will + only adjust fan speed when temperature across some "edge", e.g temperature + changes to exceed high threshold. + When disable kernel thermal algorithm, kernel no longer adjust fan speed. + We usually disable the algorithm when we want to set a fix speed. E.g, when + a fan unit is removed from system, we will set fan speed to 100% and disable + the algorithm to avoid it adjust the speed. 
+ """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not force and cls.thermal_algorithm_status == status: + return + + cls.thermal_algorithm_status = status + content = "enabled" if status else "disabled" + policy = "step_wise" if status else "user_space" + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + @classmethod + def check_thermal_zone_temperature(cls): + """ + Check thermal zone current temperature with normal temperature + + Returns: + True if all thermal zones current temperature less or equal than normal temperature + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_ASIC_PATH): + return False + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_MODULE_PATH.format(start + index)): + return False + + if THERMAL_DEV_CATEGORY_GEARBOX in 
cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_GEARBOX_PATH.format(start + index)): + return False + + return True + + @classmethod + def _check_thermal_zone_temperature(cls, thermal_zone_path): + normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) + current_temp_path = join(thermal_zone_path, THERMAL_ZONE_TEMPERATURE) + normal = None + current = None + try: + with open(normal_temp_path, 'r') as file_obj: + normal = float(file_obj.read()) + + with open(current_temp_path, 'r') as file_obj: + current = float(file_obj.read()) + + return current <= normal + except Exception as e: + logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + + @classmethod + def check_module_temperature_trustable(cls): + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + for index in range(count): + fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start) + fault = cls._read_generic_file(fault_file_path, 0) + if fault.strip() != '0': + return 'untrust' + return 'trust' + + @classmethod + def get_air_flow_direction(cls): + fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT) + port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) + + # if there is any exception, let it raise + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) + if fan_ambient_temp > port_ambient_temp: + return 'p2c', fan_ambient_temp + elif fan_ambient_temp < port_ambient_temp: + return 'c2p', port_ambient_temp + else: + return 'unk', fan_ambient_temp diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py 
b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 72729287d1c5..1f8292763ddd 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -1,5 +1,6 @@ from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object +from .thermal import logger class SetFanSpeedAction(ThermalPolicyActionBase): @@ -52,7 +53,38 @@ def execute(self, thermal_info_dict): fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] for fan in fan_info_obj.get_presence_fans(): fan.set_speed(self.speed) + logger.log_info('Set all system FAN speed to {}'.format(self.speed)) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed) + + @classmethod + def set_psu_fan_speed(cls, thermal_info_dict, speed): + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(speed) + + logger.log_info('Updated PSU FAN speed to {}%'.format(speed)) + + + +@thermal_json_object('fan.all.check_and_set_speed') +class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): + """ + Action to check thermal zone temperature and recover speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Check thermal zone and set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. 
+ :return: + """ + from .thermal import Thermal + if Thermal.check_thermal_zone_temperature(): + SetAllFanSpeedAction.execute(self, thermal_info_dict) + @thermal_json_object('thermal_control.control') class ControlThermalAlgoAction(ThermalPolicyActionBase): @@ -95,14 +127,80 @@ def execute(self, thermal_info_dict): :param thermal_info_dict: A dictionary stores all thermal information. :return: """ + from .thermal_infos import FanInfo + from .thermal import Thermal + from .thermal_conditions import UpdateCoolingLevelToMinCondition + from .fan import Fan + Thermal.set_thermal_algorithm_status(self.status, False) + if self.status: + # Check thermal zone temperature, if all thermal zone temperature + # back to normal, set it to minimum allowed speed to + # save power + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + + +class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): + UNKNOWN_SKU_COOLING_LEVEL = 6 + def execute(self, thermal_info_dict): + from .device_data import DEVICE_DATA + from .fan import Fan from .thermal_infos import ChassisInfo - if ChassisInfo.INFO_NAME in thermal_info_dict: - chassis_info_obj = thermal_info_dict[ChassisInfo.INFO_NAME] - chassis = chassis_info_obj.get_chassis() - thermal_manager = chassis.get_thermal_manager() - if self.status: - thermal_manager.start_thermal_control_algorithm() - else: - thermal_manager.stop_thermal_control_algorithm() + from .thermal_conditions import MinCoolingLevelChangeCondition + from .thermal_conditions import UpdateCoolingLevelToMinCondition + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + if chassis.platform_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.platform_name] or 'minimum_table' not in DEVICE_DATA[chassis.platform_name]['thermal']: + Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL + else: + air_flow_dir = 
MinCoolingLevelChangeCondition.air_flow_dir + trust_state = MinCoolingLevelChangeCondition.trust_state + temperature = MinCoolingLevelChangeCondition.temperature + minimum_table = DEVICE_DATA[chassis.platform_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] + + for key, cooling_level in minimum_table.items(): + temp_range = key.split(':') + temp_min = int(temp_range[0].strip()) + temp_max = int(temp_range[1].strip()) + if temp_min <= temperature <= temp_max: + Fan.min_cooling_level = cooling_level - 10 + break + + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level < Fan.min_cooling_level: + Fan.set_cooling_level(Fan.min_cooling_level, Fan.min_cooling_level) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) + else: + Fan.set_cooling_level(Fan.min_cooling_level, current_cooling_level) + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) + + +class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + from .thermal_conditions import CoolingLevelChangeCondition + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10) + + +class UpdateCoolingLevelToMinAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + self.update_cooling_level_to_minimum(thermal_info_dict) + + @classmethod + def update_cooling_level_to_minimum(cls, thermal_info_dict): + from .fan import Fan + from .thermal import Thermal + from .thermal_conditions import UpdateCoolingLevelToMinCondition + from .thermal_infos import FanInfo + if Thermal.check_thermal_zone_temperature(): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + speed = Fan.min_cooling_level * 10 + for fan in fan_info_obj.get_presence_fans(): + fan.set_speed(speed) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, 
speed) + UpdateCoolingLevelToMinCondition.enable = False + else: + UpdateCoolingLevelToMinCondition.enable = True diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 2df59acc9bf1..6bd2d282862b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -32,6 +32,20 @@ def is_match(self, thermal_info_dict): return len(fan_info_obj.get_absence_fans()) == 0 if fan_info_obj else False +@thermal_json_object('fan.any.fault') +class AnyFanFaultCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.good') +class AllFanGoodCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) == 0 if fan_info_obj else False + + class PsuCondition(ThermalPolicyConditionBase): def get_psu_info(self, thermal_info_dict): from .thermal_infos import PsuInfo @@ -61,3 +75,57 @@ def is_match(self, thermal_info_dict): psu_info_obj = self.get_psu_info(thermal_info_dict) return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False + +class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): + trust_state = None + air_flow_dir = None + temperature = None + + def is_match(self, thermal_info_dict): + from .thermal import Thermal + + trust_state = Thermal.check_module_temperature_trustable() + air_flow_dir, temperature = Thermal.get_air_flow_direction() + temperature = temperature / 1000 + + change_cooling_level = False + if trust_state != MinCoolingLevelChangeCondition.trust_state: + MinCoolingLevelChangeCondition.trust_state = trust_state + change_cooling_level = True + + if 
air_flow_dir != MinCoolingLevelChangeCondition.air_flow_dir: + MinCoolingLevelChangeCondition.air_flow_dir = air_flow_dir + change_cooling_level = True + + if temperature != MinCoolingLevelChangeCondition.temperature: + MinCoolingLevelChangeCondition.temperature = temperature + change_cooling_level = True + + return change_cooling_level + + +class CoolingLevelChangeCondition(ThermalPolicyConditionBase): + cooling_level = None + + def is_match(self, thermal_info_dict): + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level != CoolingLevelChangeCondition.cooling_level: + CoolingLevelChangeCondition.cooling_level = current_cooling_level + return True + else: + return False + + +class UpdateCoolingLevelToMinCondition(ThermalPolicyConditionBase): + enable = False + def is_match(self, thermal_info_dict): + if not UpdateCoolingLevelToMinCondition.enable: + return False + + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level == Fan.min_cooling_level: + UpdateCoolingLevelToMinCondition.enable = False + return False + return True diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 82c186495f5e..e810a5646456 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -14,6 +14,7 @@ class FanInfo(ThermalPolicyInfoBase): def __init__(self): self._absence_fans = set() self._presence_fans = set() + self._fault_fans = set() self._status_changed = False def collect(self, chassis): @@ -24,17 +25,27 @@ def collect(self, chassis): """ self._status_changed = False for fan in chassis.get_all_fans(): - if fan.get_presence() and fan not in self._presence_fans: + presence = fan.get_presence() + status = fan.get_status() + if presence and fan not in self._presence_fans: self._presence_fans.add(fan) 
self._status_changed = True if fan in self._absence_fans: self._absence_fans.remove(fan) - elif not fan.get_presence() and fan not in self._absence_fans: + elif not presence and fan not in self._absence_fans: self._absence_fans.add(fan) self._status_changed = True if fan in self._presence_fans: self._presence_fans.remove(fan) + if not status and fan not in self._fault_fans: + self._fault_fans.add(fan) + self._status_changed = True + elif status and fan in self._fault_fans: + self._fault_fans.remove(fan) + self._status_changed = True + + def get_absence_fans(self): """ Retrieves absence fans @@ -49,6 +60,13 @@ def get_presence_fans(self): """ return self._presence_fans + def get_fault_fans(self): + """ + Retrieves fault fans + :return: A set of fault fans + """ + return self._fault_fans + def is_status_changed(self): """ Retrieves if the status of fan information changed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 133bb078ca20..914eec79816c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,12 +1,29 @@ import os from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase +from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from .thermal_actions import * from .thermal_conditions import * from .thermal_infos import * class ThermalManager(ThermalManagerBase): - THERMAL_ALGORITHM_CONTROL_PATH = '/var/run/hw-management/config/suspend' + @classmethod + def initialize(cls): + """ + Initialize thermal manager, including register thermal condition types and thermal action types + and any other vendor specific initialization. 
+ :return: + """ + cls._add_private_thermal_policy() + + @classmethod + def deinitialize(cls): + """ + Destroy thermal manager, including any vendor specific cleanup. The default behavior of this function + is a no-op. + :return: + """ + cls.start_thermal_control_algorithm() @classmethod def start_thermal_control_algorithm(cls): @@ -16,7 +33,8 @@ def start_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(False) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(True) @classmethod def stop_thermal_control_algorithm(cls): @@ -26,25 +44,22 @@ def stop_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(True) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(False) @classmethod - def _control_thermal_control_algorithm(cls, suspend): - """ - Control thermal control algorithm - - Args: - suspend: Bool, indicate suspend the algorithm or not + def _add_private_thermal_policy(cls): + dynamic_min_speed_policy = ThermalPolicy() + dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() + dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() + cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy - Returns: - bool: True if set success, False if fail. 
- """ - status = True - write_value = 1 if suspend else 0 - try: - with open(cls.THERMAL_ALGORITHM_CONTROL_PATH, 'w') as control_file: - control_file.write(str(write_value)) - except (ValueError, IOError): - status = False + update_psu_fan_speed_policy = ThermalPolicy() + update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition() + update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction() + cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy - return status + update_cooling_level_policy = ThermalPolicy() + update_cooling_level_policy.conditions[UpdateCoolingLevelToMinCondition] = UpdateCoolingLevelToMinCondition() + update_cooling_level_policy.actions[UpdateCoolingLevelToMinAction] = UpdateCoolingLevelToMinAction() + cls._policy_dict['UpdateCoolingLevelPolicy'] = update_cooling_level_policy diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index f34ace97968d..c53480584889 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -1,13 +1,20 @@ class MockFan: + speed = 60 def __init__(self): self.presence = True - self.speed = 60 + self.status = True def get_presence(self): return self.presence def set_speed(self, speed): - self.speed = speed + MockFan.speed = speed + + def get_status(self): + return self.status + + def get_target_speed(self): + return MockFan.speed class MockPsu: @@ -21,6 +28,9 @@ def get_presence(self): def get_powergood_status(self): return self.powergood + def get_all_fans(self): + return [] + class MockChassis: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 843244e937fa..835d7a495bbb 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ 
b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -11,6 +11,11 @@ from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_infos import FanInfo, PsuInfo +from sonic_platform.fan import Fan +from sonic_platform.thermal import Thermal + +Thermal.check_thermal_zone_temperature = MagicMock() +Thermal.set_thermal_algorithm_status = MagicMock() @pytest.fixture(scope='session', autouse=True) @@ -27,6 +32,7 @@ def test_load_policy(thermal_manager): assert 'any fan absence' in thermal_manager._policy_dict assert 'any psu absence' in thermal_manager._policy_dict + assert 'any fan broken' in thermal_manager._policy_dict assert 'all fan and psu presence' in thermal_manager._policy_dict assert thermal_manager._fan_speed_when_suspend == 60 @@ -40,6 +46,7 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 1 assert len(fan_info.get_presence_fans()) == 0 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() fan_list = chassis.get_all_fans() @@ -47,8 +54,15 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 0 assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() + fan_list[0].status = False + fan_info.collect(chassis) + assert len(fan_info.get_absence_fans()) == 0 + assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 1 + assert fan_info.is_status_changed() def test_psu_info(): chassis = MockChassis() @@ -77,35 +91,47 @@ def test_fan_policy(thermal_manager): chassis = MockChassis() chassis.make_fan_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[1].speed == 100 - 
thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) fan_list[0].presence = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + thermal_manager.run_policy(chassis) + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + assert Thermal.check_thermal_zone_temperature.call_count == 2 + assert fan_list[0].speed == 60 + assert fan_list[1].speed == 60 + + fan_list[0].status = False + thermal_manager.run_policy(chassis) + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) + + fan_list[0].status = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + assert Thermal.check_thermal_zone_temperature.call_count == 2 + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 def test_psu_policy(thermal_manager): chassis = MockChassis() chassis.make_psu_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[0].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) psu_list = chassis.get_all_psus() psu_list[0].presence = True thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) def test_any_fan_absence_condition(): @@ -159,6 +185,44 @@ def test_all_fan_presence_condition(): fan_info.collect(chassis) assert condition.is_match({'fan_info': fan_info}) +def test_any_fan_fault_condition(): + chassis = 
MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AnyFanFaultCondition + condition = AnyFanFaultCondition() + assert condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert not condition.is_match({'fan_info': fan_info}) + +def test_all_fan_good_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllFanGoodCondition + condition = AllFanGoodCondition() + assert not condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert condition.is_match({'fan_info': fan_info}) + def test_any_psu_absence_condition(): chassis = MockChassis() @@ -275,6 +339,53 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_check_and_set_speed_action(): + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + json_str = '{\"speed\": \"40\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert action.speed == 40 + + json_str = '{\"speed\": \"-1\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"speed\": \"101\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"invalid\": \"60\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + +def test_execute_check_and_set_fan_speed_action(): + 
chassis = MockChassis() + fan_list = chassis.get_all_fans() + fan_list.append(MockFan()) + fan_list.append(MockFan()) + fan_info = FanInfo() + fan_info.collect(chassis) + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + action.speed = 99 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 99 + assert fan_list[1].speed == 99 + + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) + fan_list[0].speed = 100 + fan_list[1].speed = 100 + action.speed = 60 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 + def test_load_duplicate_condition(): from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy with open(os.path.join(test_path, 'duplicate_condition.json')) as f: @@ -315,4 +426,94 @@ class MockThermalManager(ThermalManagerBase): with pytest.raises(Exception): MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) +def test_dynamic_minimum_table_data(): + from sonic_platform.device_data import DEVICE_DATA + for platform, platform_data in DEVICE_DATA.items(): + if 'thermal' in platform_data and 'minimum_table' in platform_data['thermal']: + minimum_table = platform_data['thermal']['minimum_table'] + check_minimum_table_data(platform, minimum_table) + +def check_minimum_table_data(platform, minimum_table): + valid_dir = ['p2c', 'c2p', 'unk'] + valid_trust_state = ['trust', 'untrust'] + + for category, data in minimum_table.items(): + key_data = category.split('_') + assert key_data[0] in valid_dir + assert key_data[1] in valid_trust_state + + data_list = [(value, key) for key, value in data.items()] + data_list.sort(key=lambda x : x[0]) + + previous_edge = None + previous_cooling_level = None + for item in data_list: + cooling_level = item[0] + range_str = item[1] + + ranges = 
range_str.split(':') + low = int(ranges[0]) + high = int(ranges[1]) + assert low < high + + if previous_edge is None: + assert low == -127 + else: + assert low - previous_edge == 1, '{}-{}-{} error, item={}'.format(platform, key_data[0], key_data[1], item) + previous_edge = high + + assert 10 <= cooling_level <= 20 + if previous_cooling_level is not None: + assert cooling_level > previous_cooling_level + previous_cooling_level = cooling_level + +def test_dynamic_minimum_policy(thermal_manager): + from sonic_platform.thermal_conditions import MinCoolingLevelChangeCondition + from sonic_platform.thermal_actions import ChangeMinCoolingLevelAction + from sonic_platform.thermal_infos import ChassisInfo + from sonic_platform.thermal import Thermal + from sonic_platform.fan import Fan + ThermalManager.initialize() + assert 'DynamicMinCoolingLevelPolicy' in thermal_manager._policy_dict + policy = thermal_manager._policy_dict['DynamicMinCoolingLevelPolicy'] + assert MinCoolingLevelChangeCondition in policy.conditions + assert ChangeMinCoolingLevelAction in policy.actions + + condition = policy.conditions[MinCoolingLevelChangeCondition] + action = policy.actions[ChangeMinCoolingLevelAction] + Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') + Thermal.get_air_flow_direction = MagicMock(return_value=('p2c', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'trust' + assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' + assert MinCoolingLevelChangeCondition.temperature == 35 + assert not condition.is_match(None) + + Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'untrust' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.air_flow_dir == 'c2p' + + Thermal.get_air_flow_direction = 
MagicMock(return_value=('c2p', 25000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.temperature == 25 + chassis = MockChassis() + chassis.platform_name = 'invalid' + info = ChassisInfo() + info._chassis = chassis + thermal_info_dict = {ChassisInfo.INFO_NAME: info} + Fan.get_cooling_level = MagicMock(return_value=5) + Fan.set_cooling_level = MagicMock() + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 6 + Fan.set_cooling_level.assert_called_with(6, 6) + Fan.set_cooling_level.call_count = 0 + + chassis.platform_name = 'x86_64-mlnx_msn2700-r0' + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 4 + Fan.set_cooling_level.assert_called_with(4, 5) diff --git a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json index 5d31b2abd875..413211b21220 100644 --- a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,19 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { "type": "thermal_control.control", "status": "true" + }, + { + "type": "fan.all.check_and_set_speed", + "speed": "60" } ] } From 1eecae2339ebcde4a893bee3b4f5ec7151a08dd0 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 28 Apr 2020 11:52:57 +0800 Subject: [PATCH 04/14] [Mellanox] Adjust dynamic minimum fan speed algorithm (#4476) * remove air flow direction from dynamic minimum algorithm * adjust minimum table according to thermal data --- .../sonic_platform/device_data.py | 53 
+++++-------------- .../sonic_platform/thermal.py | 9 +--- .../sonic_platform/thermal_actions.py | 3 +- .../sonic_platform/thermal_conditions.py | 7 +-- .../tests/test_thermal_policy.py | 13 ++--- 5 files changed, 21 insertions(+), 64 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index f006281c511f..35b1f14d5bf9 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -2,34 +2,22 @@ 'x86_64-mlnx_msn2700-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:40":13, "41:120":15}, - "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, - "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, - "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, - "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, - "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + "unk_trust": {"-127:30":13, "31:40":14 , "41:120":15}, + "unk_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16} } } }, 'x86_64-mlnx_msn2740-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:120":13}, - "p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15}, - "c2p_trust": {"-127:120":13}, - "c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, "unk_trust": {"-127:120":13}, - "unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + "unk_untrust": {"-127:15":13, "16:25":14 , "26:30":15, "31:120":17}, } } }, 'x86_64-mlnx_msn2100-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:120":12}, - "p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}, - "c2p_trust": {"-127:40":12, "41:120":13}, - "c2p_untrust": {"-127:40":12, "41:120":13}, "unk_trust": {"-127:40":12, "41:120":13}, "unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, 
"36:120":16} } @@ -38,22 +26,14 @@ 'x86_64-mlnx_msn2410-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:40":13, "41:120":15}, - "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, - "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, - "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, - "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, - "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + "unk_trust": {"-127:30":13, "31:40":14 , "41:120":15}, + "unk_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16} } } }, 'x86_64-mlnx_msn2010-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:120":12}, - "p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16}, - "c2p_trust": {"-127:120":12}, - "c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16}, "unk_trust": {"-127:120":12}, "unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} } @@ -62,10 +42,6 @@ 'x86_64-mlnx_msn3700-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, - "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, - "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, - "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, } @@ -74,28 +50,25 @@ 'x86_64-mlnx_msn3700c-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, - "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, - "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, - "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, - "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, - "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": 
{"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:10":12, "11:20":13 , "21:30":14, "31:35":15, "36:120":16}, } } }, 'x86_64-mlnx_msn3800-r0': { 'thermal': { 'minimum_table': { - "p2c_trust": {"-127:35":12, "36:120":13}, - "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, - "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, - "c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, } } }, 'x86_64-mlnx_msn4700-r0': { - + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":16}, + "unk_untrust": {"-127:120":16}, + } + } } } \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index bb0ef5fb776a..be91cf9d523e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -583,16 +583,11 @@ def check_module_temperature_trustable(cls): return 'trust' @classmethod - def get_air_flow_direction(cls): + def get_min_amb_temperature(cls): fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT) port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) # if there is any exception, let it raise fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) - if fan_ambient_temp > port_ambient_temp: - return 'p2c', fan_ambient_temp - elif fan_ambient_temp < port_ambient_temp: - return 'c2p', port_ambient_temp - else: - return 'unk', fan_ambient_temp + return fan_ambient_temp if fan_ambient_temp < port_ambient_temp else port_ambient_temp diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py 
b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 1f8292763ddd..3a4d5f2a8a68 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -154,10 +154,9 @@ def execute(self, thermal_info_dict): if chassis.platform_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.platform_name] or 'minimum_table' not in DEVICE_DATA[chassis.platform_name]['thermal']: Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL else: - air_flow_dir = MinCoolingLevelChangeCondition.air_flow_dir trust_state = MinCoolingLevelChangeCondition.trust_state temperature = MinCoolingLevelChangeCondition.temperature - minimum_table = DEVICE_DATA[chassis.platform_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] + minimum_table = DEVICE_DATA[chassis.platform_name]['thermal']['minimum_table']['unk_{}'.format(trust_state)] for key, cooling_level in minimum_table.items(): temp_range = key.split(':') diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 6bd2d282862b..94e18a2e00b0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -78,14 +78,13 @@ def is_match(self, thermal_info_dict): class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): trust_state = None - air_flow_dir = None temperature = None def is_match(self, thermal_info_dict): from .thermal import Thermal trust_state = Thermal.check_module_temperature_trustable() - air_flow_dir, temperature = Thermal.get_air_flow_direction() + temperature = Thermal.get_min_amb_temperature() temperature = temperature / 1000 change_cooling_level = False @@ -93,10 +92,6 @@ def is_match(self, thermal_info_dict): 
MinCoolingLevelChangeCondition.trust_state = trust_state change_cooling_level = True - if air_flow_dir != MinCoolingLevelChangeCondition.air_flow_dir: - MinCoolingLevelChangeCondition.air_flow_dir = air_flow_dir - change_cooling_level = True - if temperature != MinCoolingLevelChangeCondition.temperature: MinCoolingLevelChangeCondition.temperature = temperature change_cooling_level = True diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 835d7a495bbb..87fac359b2fd 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -482,10 +482,9 @@ def test_dynamic_minimum_policy(thermal_manager): condition = policy.conditions[MinCoolingLevelChangeCondition] action = policy.actions[ChangeMinCoolingLevelAction] Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') - Thermal.get_air_flow_direction = MagicMock(return_value=('p2c', 35000)) + Thermal.get_min_amb_temperature = MagicMock(return_value=35000) assert condition.is_match(None) assert MinCoolingLevelChangeCondition.trust_state == 'trust' - assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' assert MinCoolingLevelChangeCondition.temperature == 35 assert not condition.is_match(None) @@ -493,11 +492,7 @@ def test_dynamic_minimum_policy(thermal_manager): assert condition.is_match(None) assert MinCoolingLevelChangeCondition.trust_state == 'untrust' - Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 35000)) - assert condition.is_match(None) - assert MinCoolingLevelChangeCondition.air_flow_dir == 'c2p' - - Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 25000)) + Thermal.get_min_amb_temperature = MagicMock(return_value=25000) assert condition.is_match(None) assert MinCoolingLevelChangeCondition.temperature == 25 @@ -515,5 +510,5 @@ def 
test_dynamic_minimum_policy(thermal_manager): chassis.platform_name = 'x86_64-mlnx_msn2700-r0' action.execute(thermal_info_dict) - assert Fan.min_cooling_level == 4 - Fan.set_cooling_level.assert_called_with(4, 5) + assert Fan.min_cooling_level == 3 + Fan.set_cooling_level.assert_called_with(3, 5) From 1a94141a720d3a3eae3ff2d8bc2c4cbbd14a6b89 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Wed, 27 May 2020 01:45:25 +0800 Subject: [PATCH 05/14] [Mellanox] Adjust log level to avoid too many thermal logs (#4631) * Trigger thermal action log only if thermal condition changes * test file existence before read file content * fix error for set psu fan speed * Remove logs because it print too frequently --- .../mlnx-platform-api/sonic_platform/fan.py | 2 ++ .../mlnx-platform-api/sonic_platform/psu.py | 2 ++ .../sonic_platform/thermal.py | 6 ++++- .../sonic_platform/thermal_actions.py | 22 +++++++++---------- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 985924f30857..d0114dedaee7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -241,6 +241,8 @@ def set_speed(self, speed): status = True if self.is_psu_fan: + if not self.get_presence(): + return False from .thermal import logger try: with open(self.psu_i2c_bus_path, 'r') as f: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 4da87d95e1e5..2a44a917c7a2 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -112,6 +112,8 @@ def _read_generic_file(self, filename, len): """ result = 0 try: + if not os.path.exists(filename): + return result with open(filename, 'r') as fileobj: 
result = int(fileobj.read().strip()) except Exception as e: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index be91cf9d523e..7f462b9c3092 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -495,12 +495,15 @@ def set_thermal_algorithm_status(cls, status, force=True): We usually disable the algorithm when we want to set a fix speed. E.g, when a fan unit is removed from system, we will set fan speed to 100% and disable the algorithm to avoid it adjust the speed. + + Returns: + True if thermal algorithm status changed. """ if not cls.thermal_profile: raise Exception("Fail to get thermal profile for this switch") if not force and cls.thermal_algorithm_status == status: - return + return False cls.thermal_algorithm_status = status content = "enabled" if status else "disabled" @@ -521,6 +524,7 @@ def set_thermal_algorithm_status(cls, status, force=True): for index in range(count): cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + return True @classmethod def check_thermal_zone_temperature(cls): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 3a4d5f2a8a68..9a4cde05b842 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -66,9 +66,6 @@ def set_psu_fan_speed(cls, thermal_info_dict, speed): for psu_fan in psu.get_all_fans(): psu_fan.set_speed(speed) - logger.log_info('Updated PSU FAN speed to {}%'.format(speed)) - - @thermal_json_object('fan.all.check_and_set_speed') class 
CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): @@ -131,14 +128,17 @@ def execute(self, thermal_info_dict): from .thermal import Thermal from .thermal_conditions import UpdateCoolingLevelToMinCondition from .fan import Fan - Thermal.set_thermal_algorithm_status(self.status, False) - if self.status: - # Check thermal zone temperature, if all thermal zone temperature - # back to normal, set it to minimum allowed speed to - # save power - UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + status_changed = Thermal.set_thermal_algorithm_status(self.status, False) - logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + # Only update cooling level if thermal algorithm status changed + if status_changed: + if self.status: + # Check thermal zone temperature, if all thermal zone temperature + # back to normal, set it to minimum allowed speed to + # save power + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): @@ -174,8 +174,6 @@ def execute(self, thermal_info_dict): Fan.set_cooling_level(Fan.min_cooling_level, current_cooling_level) UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) - logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) - class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): def execute(self, thermal_info_dict): From 0078acf65fb119db97f169c8557026e177282ac0 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Wed, 27 May 2020 01:46:29 +0800 Subject: [PATCH 06/14] [Mellanox] Never disable kernel thermal algorithm at real-time (#4638) --- .../x86_64-mlnx_msn2700-r0/thermal_policy.json | 15 +-------------- .../sonic_platform/thermal_actions.py | 6 ++++++ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git 
a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json index f16f68dd002e..1e23d6c8b2bd 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -23,10 +23,6 @@ } ], "actions": [ - { - "type": "thermal_control.control", - "status": "false" - }, { "type": "fan.all.set_speed", "speed": "100" @@ -41,10 +37,6 @@ } ], "actions": [ - { - "type": "thermal_control.control", - "status": "false" - }, { "type": "fan.all.set_speed", "speed": "100" @@ -59,10 +51,6 @@ } ], "actions": [ - { - "type": "thermal_control.control", - "status": "false" - }, { "type": "fan.all.set_speed", "speed": "100" @@ -84,8 +72,7 @@ ], "actions": [ { - "type": "thermal_control.control", - "status": "true" + "type": "thermal.recover" } ] } diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 9a4cde05b842..e7436bd0a5b7 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -141,6 +141,12 @@ def execute(self, thermal_info_dict): logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) +@thermal_json_object('thermal.recover') +class ThermalRecoverAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): UNKNOWN_SKU_COOLING_LEVEL = 6 def execute(self, thermal_info_dict): From 2baaeaaceb5e0c2686507dc6ee026b90f74db0f5 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Mon, 1 Jun 2020 10:38:34 +0300 Subject: [PATCH 07/14] remove simx patch and add new patch for disable hw-management thermal control service --- .../0001-Make-hw-mgmt-SimX-compatiable.patch | 53 
------------------- ...-Disable-thermal-policy-running-in-h.patch | 27 ++++++++++ ...-Disable-thermal-policy-running-in-h.patch | 31 ----------- 3 files changed, 27 insertions(+), 84 deletions(-) delete mode 100644 platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch create mode 100644 platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch delete mode 100644 platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch diff --git a/platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch b/platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch deleted file mode 100644 index a72c94473e88..000000000000 --- a/platform/mellanox/hw-management/0001-Make-hw-mgmt-SimX-compatiable.patch +++ /dev/null @@ -1,53 +0,0 @@ -From ebb17bd1f6996f73cb67313846a63c789e74c4f4 Mon Sep 17 00:00:00 2001 -From: Mykola Faryma -Date: Fri, 21 Feb 2020 12:28:54 +0200 -Subject: [PATCH 1/1] Make hw-mgmt SimX compatiable - -Signed-off-by: Mykola Faryma ---- - usr/usr/bin/hw-management.sh | 29 +++++++++++++++++++++++++++++ - 1 file changed, 29 insertions(+) - -diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh -index 1b5b18a..3dfd4b1 100755 ---- a/usr/usr/bin/hw-management.sh -+++ b/usr/usr/bin/hw-management.sh -@@ -943,6 +943,35 @@ do_chip_down() - /usr/bin/hw-management-thermal-events.sh change hotplug_asic down %S %p - } - -+handle_simx() -+{ -+ local -r onie_platform="$(cat /host/machine.conf | grep onie_platform | cut -d= -f2)" -+ -+ local -r syseeprom_cache_path="/var/cache/sonic/decode-syseeprom/syseeprom_cache" -+ local -r syseeprom_hex_path="/usr/share/sonic/device/${onie_platform}/syseeprom.hex" -+ local -r syseeprom_vpd_path="/var/run/hw-management/eeprom/vpd_info" -+ -+ case $ACTION in -+ start) -+ /bin/bash -c "/bin/rm -f ${syseeprom_cache_path}" -+ /bin/bash -c "/bin/mkdir -p ${eeprom_path}" -+ /bin/bash -c "/usr/bin/xxd -r -p 
${syseeprom_hex_path} ${syseeprom_vpd_path}" -+ ;; -+ stop) -+ /bin/bash -c "/bin/rm -fr ${hw_management_path}" -+ ;; -+ *) -+ echo "Usage: `basename $0` {start|stop}" -+ exit 1 -+ ;; -+ esac -+} -+ -+if [[ "$(cat /sys/devices/virtual/dmi/id/sys_vendor)" = "QEMU" ]]; then -+ handle_simx -+ exit 0 -+fi -+ - case $ACTION in - start) - if [ -d /var/run/hw-management ]; then --- -1.9.1 - diff --git a/platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch b/platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch new file mode 100644 index 000000000000..2bdadebcd0c2 --- /dev/null +++ b/platform/mellanox/hw-management/0001-hw-management.sh-Disable-thermal-policy-running-in-h.patch @@ -0,0 +1,27 @@ +From 3512488c981eb81d51ce92cb3573721e36861f56 Mon Sep 17 00:00:00 2001 +From: Junchao Chen +Date: Fri, 29 May 2020 10:38:53 +0300 +Subject: [PATCH] Disable hw-management thermal control service + +--- + usr/usr/bin/hw-management.sh | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh +index 65e5d39..0d1c4a1 100755 +--- a/usr/usr/bin/hw-management.sh ++++ b/usr/usr/bin/hw-management.sh +@@ -832,7 +832,9 @@ do_start() + if [ -f $config_path/max_tachos ]; then + max_tachos=$(<$config_path/max_tachos) + fi +- $THERMAL_CONTROL $thermal_type $max_tachos $max_psus& ++ # Disable hw-management thermal control because ++ # SONiC already implement it ++ #$THERMAL_CONTROL $thermal_type $max_tachos $max_psus& + } + + do_stop() +-- +1.9.1 + diff --git a/platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch b/platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch deleted file mode 100644 index d1c34fd16ec0..000000000000 --- a/platform/mellanox/hw-management/0002-hw-management.sh-Disable-thermal-policy-running-in-h.patch +++ /dev/null @@ -1,31 +0,0 @@ 
-From 76b02916794be2e2558fcff1d11609a594f633d7 Mon Sep 17 00:00:00 2001 -From: Stephen Sun -Date: Fri, 14 Feb 2020 13:48:00 +0800 -Subject: [PATCH] Disable thermal policy running in hw-mgmt service SONiC - thermal control algorithm has been supported. - -Signed-off-by: Stephen Sun ---- - usr/usr/bin/hw-management.sh | 6 +++++- - 1 file changed, 5 insertions(+), 1 deletion(-) - -diff --git a/usr/usr/bin/hw-management.sh b/usr/usr/bin/hw-management.sh -index 2cdbfb2..48b41d5 100755 ---- a/usr/usr/bin/hw-management.sh -+++ b/usr/usr/bin/hw-management.sh -@@ -799,7 +799,11 @@ do_start() - #disabled for leopard chipless bringup. - echo 1 > $config_path/suspend - -- $THERMAL_CONTROL $thermal_type $max_tachos $max_psus& -+# -+# Disable thermal control algorithm in hw-management service -+# because there has already been that in SONiC -+# -+# $THERMAL_CONTROL $thermal_type $max_tachos $max_psus& - } - - do_stop() --- -1.9.1 - From 05dc67ed3371e6f62eaa030cd9f0ff3a026fadf5 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Mon, 1 Jun 2020 10:39:24 +0300 Subject: [PATCH 08/14] fix merge issue --- .../mlnx-platform-api/sonic_platform/chassis.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 1ac782555673..5ecf3c150de9 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -15,6 +15,7 @@ from sonic_daemon_base.daemon_base import Logger from os import listdir from os.path import isfile, join + from glob import glob import sys import io import re @@ -35,6 +36,10 @@ HWMGMT_SYSTEM_ROOT = '/var/run/hw-management/system/' +MST_DEVICE_NAME_PATTERN = '/dev/mst/mt[0-9]*_pciconf0' +MST_DEVICE_RE_PATTERN = '/dev/mst/mt([0-9]*)_pciconf0' +SPECTRUM1_CHIP_ID = '52100' + #reboot cause related definitions REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT @@ -95,6 
+100,16 @@ def initialize_fan(self): num_of_fan, num_of_drawer = self._extract_num_of_fans_and_fan_drawers() multi_rotor_in_drawer = num_of_fan > num_of_drawer + # Fan's direction isn't supported on spectrum 1 devices for now + mst_dev_list = glob(MST_DEVICE_NAME_PATTERN) + if not mst_dev_list: + raise RuntimeError("Can't get chip type due to {} not found".format(MST_DEVICE_NAME_PATTERN)) + m = re.search(MST_DEVICE_RE_PATTERN, mst_dev_list[0]) + if m.group(1) == SPECTRUM1_CHIP_ID: + has_fan_dir = False + else: + has_fan_dir = True + for index in range(num_of_fan): if multi_rotor_in_drawer: fan = Fan(has_fan_dir, index, index/2, False, self.platform_name) From 59e2ab53aeb5d52566e0fb0c4e26a1b58ae6eaff Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Mon, 1 Jun 2020 10:40:14 +0300 Subject: [PATCH 09/14] Add support for SN4700, SN4600C and SN3420 --- .../x86_64-mlnx_msn3420-r0/thermal_policy.json | 1 + .../x86_64-mlnx_msn4600c-r0/thermal_policy.json | 1 + .../x86_64-mlnx_msn4700-r0/thermal_policy.json | 1 + .../sonic_platform/device_data.py | 16 ++++++++++++++++ 4 files changed, 19 insertions(+) create mode 120000 device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json create mode 120000 device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json diff --git a/device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn3420-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn4600c-r0/thermal_policy.json @@ -0,0 +1 @@ 
+../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json new file mode 120000 index 000000000000..5a25cd87f70c --- /dev/null +++ b/device/mellanox/x86_64-mlnx_msn4700-r0/thermal_policy.json @@ -0,0 +1 @@ +../x86_64-mlnx_msn2700-r0/thermal_policy.json \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py index 35b1f14d5bf9..bbf7f36c9256 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -70,5 +70,21 @@ "unk_untrust": {"-127:120":16}, } } + }, + 'x86_64-mlnx_msn3420-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":16}, + "unk_untrust": {"-127:120":16}, + } + } + }, + 'x86_64-mlnx_msn4600c-r0': { + 'thermal': { + 'minimum_table': { + "unk_trust": {"-127:120":16}, + "unk_untrust": {"-127:120":16}, + } + } } } \ No newline at end of file From 9132c32f94c32fc5433d2564694bd4803250af5a Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Mon, 1 Jun 2020 10:41:04 +0300 Subject: [PATCH 10/14] Update submodule pointer for sonic-linux-kernel to fix kernel algorithm issue on SN4700 --- src/sonic-linux-kernel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-linux-kernel b/src/sonic-linux-kernel index ea9e81d95c26..e3833d7f2972 160000 --- a/src/sonic-linux-kernel +++ b/src/sonic-linux-kernel @@ -1 +1 @@ -Subproject commit ea9e81d95c269799e991e22cb5797e6da421a3ce +Subproject commit e3833d7f2972b8444b1a9a08882a125a65317d6e From c8beb142083b441ecbfd9ce35f4e36251679f5ef Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Thu, 4 Jun 2020 04:29:30 +0300 Subject: [PATCH 11/14] Update submodule pointer for sonic-platform-common --- src/sonic-platform-common | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-platform-common b/src/sonic-platform-common index df964ac98dc4..9036e15dffe9 160000 --- a/src/sonic-platform-common +++ b/src/sonic-platform-common @@ -1 +1 @@ -Subproject commit df964ac98dc46c0096ef19a683ff58637c4e2b05 +Subproject commit 9036e15dffe9b6581e4c724726abbea8446f9993 From 3951ff663f720c2c6a97c7cf93f035a7a4bc57e6 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Thu, 4 Jun 2020 05:04:34 +0300 Subject: [PATCH 12/14] fix LGTM warnings --- .../mlnx-platform-api/sonic_platform/psu.py | 2 +- .../sonic_platform/thermal_manager.py | 1 - .../tests/test_thermal_policy.py | 18 ------------------ 3 files changed, 1 insertion(+), 20 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 2a44a917c7a2..eb81fd65a051 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -192,7 +192,7 @@ def _get_led_capability(self): caps = psu_led_cap.read() cap_list = caps.split() except (ValueError, IOError): - status = 0 + pass return cap_list diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 914eec79816c..8a9f2374db04 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,4 +1,3 @@ -import os from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from .thermal_actions import * diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 87fac359b2fd..f25b6421ed85 100644 --- 
a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -11,7 +11,6 @@ from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_infos import FanInfo, PsuInfo -from sonic_platform.fan import Fan from sonic_platform.thermal import Thermal Thermal.check_thermal_zone_temperature = MagicMock() @@ -168,23 +167,6 @@ def test_all_fan_absence_condition(): assert condition.is_match({'fan_info': fan_info}) -def test_all_fan_presence_condition(): - chassis = MockChassis() - chassis.make_fan_absence() - fan = MockFan() - fan_list = chassis.get_all_fans() - fan_list.append(fan) - fan_info = FanInfo() - fan_info.collect(chassis) - - from sonic_platform.thermal_conditions import AllFanPresenceCondition - condition = AllFanPresenceCondition() - assert not condition.is_match({'fan_info': fan_info}) - - fan_list[0].presence = True - fan_info.collect(chassis) - assert condition.is_match({'fan_info': fan_info}) - def test_any_fan_fault_condition(): chassis = MockChassis() fan = MockFan() From 6da400dddb65782e9ea796ff54e77801ce67543e Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Thu, 4 Jun 2020 05:38:49 +0300 Subject: [PATCH 13/14] ignore LGTM warnings for from . 
import * --- .../mlnx-platform-api/sonic_platform/thermal_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 8a9f2374db04..c0eae332e435 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,8 +1,8 @@ from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy -from .thermal_actions import * -from .thermal_conditions import * -from .thermal_infos import * +from .thermal_actions import * # lgtm [py/polluting-import] +from .thermal_conditions import * # lgtm [py/polluting-import] +from .thermal_infos import * # lgtm [py/polluting-import] class ThermalManager(ThermalManagerBase): From 4b5f579ce6562131804d32d743fc330535a015ab Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Thu, 4 Jun 2020 06:03:31 +0300 Subject: [PATCH 14/14] Update submodule pointer for sonic-utilities --- src/sonic-utilities | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-utilities b/src/sonic-utilities index 3d77d5a9c49d..40e7452d3007 160000 --- a/src/sonic-utilities +++ b/src/sonic-utilities @@ -1 +1 @@ -Subproject commit 3d77d5a9c49d764f9d7c479ec8446a2b8aec4925 +Subproject commit 40e7452d300758341d31f4afee59f2de2eb4dc47