From c730f3e2079ddae78f5db3401eab35046e21dd4f Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Tue, 21 Apr 2020 23:09:53 +0800 Subject: [PATCH] [Mellanox] thermal control enhancement for dynamic minimum fan speed and PSU fan speed policy (#4403) --- .../thermal_policy.json | 27 ++- dockers/docker-platform-monitor/Dockerfile.j2 | 3 +- .../sonic_platform/chassis.py | 12 +- .../sonic_platform/device_data.py | 101 ++++++++ .../mlnx-platform-api/sonic_platform/fan.py | 104 +++++++-- .../mlnx-platform-api/sonic_platform/psu.py | 4 +- .../sonic_platform/thermal.py | 148 +++++++++++- .../sonic_platform/thermal_actions.py | 114 ++++++++- .../sonic_platform/thermal_conditions.py | 68 ++++++ .../sonic_platform/thermal_infos.py | 22 +- .../sonic_platform/thermal_manager.py | 55 +++-- .../mlnx-platform-api/tests/mock_platform.py | 14 +- .../tests/test_thermal_policy.py | 217 +++++++++++++++++- .../tests/thermal_policy.json | 25 ++ 14 files changed, 849 insertions(+), 65 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json index 054d797be951..f16f68dd002e 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -1,6 +1,6 @@ { "thermal_control_algorithm": { - "run_at_boot_up": "false", + "run_at_boot_up": "true", "fan_speed_when_suspend": "60" }, "info_types": [ @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,15 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { - "type": "fan.all.set_speed", - "speed": "60" + "type": "thermal_control.control", + "status": "true" } ] } diff --git a/dockers/docker-platform-monitor/Dockerfile.j2 b/dockers/docker-platform-monitor/Dockerfile.j2 index c1e86abb5170..d1b6d7dfc779 100755 --- a/dockers/docker-platform-monitor/Dockerfile.j2 +++ b/dockers/docker-platform-monitor/Dockerfile.j2 @@ -17,7 +17,8 @@ RUN apt-get update && \ rrdtool \ python-smbus \ ethtool \ - dmidecode && \ + dmidecode \ + i2c-tools && \ pip install enum34 {% if docker_platform_monitor_debs.strip() -%} diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index 08c2ccf80a5c..d47110ed8f45 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -29,6 +29,7 @@ MLNX_NUM_PSU = 2 GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku" +GET_PLATFORM_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.platform" EEPROM_CACHE_ROOT = '/var/cache/sonic/decode-syseeprom' EEPROM_CACHE_FILE = 'syseeprom_cache' @@ -60,6 +61,7 @@ def __init__(self): # Initialize SKU name self.sku_name = self._get_sku_name() + self.platform_name = self._get_platform_name() mi = get_machine_info() if mi is not None: self.name = mi['onie_platform'] @@ -110,9 +112,9 @@ def initialize_fan(self): for index in range(num_of_fan): if multi_rotor_in_drawer: - fan = Fan(has_fan_dir, index, index/2, False, self.sku_name) + fan = Fan(has_fan_dir, index, index/2, False, self.platform_name) else: - fan = Fan(has_fan_dir, index, index, False, self.sku_name) + fan = Fan(has_fan_dir, index, index, False, self.platform_name) self._fan_list.append(fan) @@ -245,6 +247,12 @@ def _get_sku_name(self): return out.rstrip('\n') + def _get_platform_name(self): + p = subprocess.Popen(GET_PLATFORM_CMD, shell=True, stdout=subprocess.PIPE) + out, err = p.communicate() + return out.rstrip('\n') + + def _get_port_position_tuple_by_sku_name(self): position_tuple = port_position_tuple_list[hwsku_dict_port[self.sku_name]] return position_tuple diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py new file mode 100644 index 000000000000..f006281c511f --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -0,0 +1,101 @@ +DEVICE_DATA = { + 'x86_64-mlnx_msn2700-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'x86_64-mlnx_msn2740-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":13}, + "p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15}, + "c2p_trust": {"-127:120":13}, + "c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + "unk_trust": {"-127:120":13}, + "unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + } + } + }, + 'x86_64-mlnx_msn2100-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:40":12, "41:120":13}, + "c2p_untrust": {"-127:40":12, "41:120":13}, + "unk_trust": {"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn2410-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'x86_64-mlnx_msn2010-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:120":12}, + "c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16}, + "unk_trust": {"-127:120":12}, + "unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} + } + } + }, + 'x86_64-mlnx_msn3700-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'x86_64-mlnx_msn3700c-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'x86_64-mlnx_msn3800-r0': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:35":12, "36:120":13}, + "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, + 'x86_64-mlnx_msn4700-r0': { + + } +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 9ce65d1e2f98..adca48befbc6 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -9,6 +9,7 @@ ############################################################################# import os.path +import subprocess try: from sonic_platform_base.fan_base import FanBase @@ -22,25 +23,34 @@ FAN_PATH = "/var/run/hw-management/thermal/" LED_PATH = "/var/run/hw-management/led/" +CONFIG_PATH = "/var/run/hw-management/config" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" +COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" -# SKUs with unplugable FANs: +# Platforms with unplugable FANs: # 1. don't have fanX_status and should be treated as always present -hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] +platform_with_unplugable_fan = ['x86_64-mlnx_msn2010-r0', 'x86_64-mlnx_msn2100-r0'] + class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" - - def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): + min_cooling_level = 2 + MIN_VALID_COOLING_LEVEL = 1 + MAX_VALID_COOLING_LEVEL = 10 + # PSU fan speed vector + PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c', + '0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64'] + + def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, platform = None): # API index is starting from 0, Mellanox platform index is starting from 1 self.index = fan_index + 1 self.drawer_index = drawer_index + 1 self.is_psu_fan = psu_fan - self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True + self.always_presence = False if platform not in platform_with_unplugable_fan else True self.fan_min_speed_path = "fan{}_min".format(self.index) if not self.is_psu_fan: @@ -54,6 +64,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None + self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command') + self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) @@ -90,7 +104,7 @@ def get_direction(self): try: with open(os.path.join(self.fan_dir), 'r') as fan_dir: - fan_dir_bits = int(fan_dir.read()) + fan_dir_bits = int(fan_dir.read().strip()) fan_mask = 1 << self.drawer_index - 1 if fan_dir_bits & fan_mask: return self.FAN_DIRECTION_INTAKE @@ -116,7 +130,7 @@ def get_status(self): else: try: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: - status = int(fault_status.read()) + status = int(fault_status.read().strip()) except (ValueError, IOError): status = 1 @@ -142,7 +156,7 @@ def get_presence(self): else: try: with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: - status = int(presence_status.read()) + status = int(presence_status.read().strip()) except (ValueError, IOError): status = 0 @@ -164,7 +178,7 @@ def _get_max_speed_in_rpm(self): speed = 0 try: with open(os.path.join(FAN_PATH, self.fan_max_speed_path), 'r') as max_fan_speed: - speed = int(max_fan_speed.read()) + speed = int(max_fan_speed.read().strip()) except (ValueError, IOError): speed = 0 @@ -181,7 +195,7 @@ def get_speed(self): speed = 0 try: with open(os.path.join(FAN_PATH, self.fan_speed_get_path), 'r') as fan_curr_speed: - speed_in_rpm = int(fan_curr_speed.read()) + speed_in_rpm = int(fan_curr_speed.read().strip()) except (ValueError, IOError): speed_in_rpm = 0 @@ -210,7 +224,7 @@ def get_target_speed(self): try: with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm: - pwm = int(fan_pwm.read()) + pwm = int(fan_pwm.read().strip()) except (ValueError, IOError): pwm = 0 @@ -231,13 +245,34 @@ def set_speed(self, speed): bool: True if set success, False if fail. """ status = True - pwm = int(round(PWM_MAX*speed/100.0)) if self.is_psu_fan: - #PSU fan speed is not setable. - return False - + from .thermal import logger + try: + with open(self.psu_i2c_bus_path, 'r') as f: + bus = f.read().strip() + with open(self.psu_i2c_addr_path, 'r') as f: + addr = f.read().strip() + with open(self.psu_i2c_command_path, 'r') as f: + command = f.read().strip() + speed = Fan.PSU_FAN_SPEED[int(speed / 10)] + command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed) + subprocess.check_call(command, shell = True) + return True + except subprocess.CalledProcessError as ce: + logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output)) + return False + except Exception as e: + logger.log_error('Failed to set PSU FAN speed - {}'.format(e)) + return False + try: + cooling_level = int(speed / 10) + if cooling_level < self.min_cooling_level: + cooling_level = self.min_cooling_level + speed = self.min_cooling_level * 10 + self.set_cooling_level(cooling_level, cooling_level) + pwm = int(round(PWM_MAX*speed/100.0)) with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: fan_pwm.write(str(pwm)) except (ValueError, IOError): @@ -352,3 +387,42 @@ def get_speed_tolerance(self): """ # The tolerance value is fixed as 20% for all the Mellanox platform return 20 + + @classmethod + def set_cooling_level(cls, level, cur_state): + """ + Change cooling level. The input level should be an integer value [1, 10]. + 1 means 10%, 2 means 20%, 10 means 100%. + """ + if not isinstance(level, int): + raise RuntimeError("Failed to set cooling level, input parameter must be integer") + + if level < cls.MIN_VALID_COOLING_LEVEL or level > cls.MAX_VALID_COOLING_LEVEL: + raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format( + cls.MIN_VALID_COOLING_LEVEL, + cls.MAX_VALID_COOLING_LEVEL, + level + )) + + try: + # Reset FAN cooling level vector. According to low level team, + # if we need set cooling level to X, we need first write a (10+X) + # to cooling_cur_state file to reset the cooling level vector. + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(level + 10)) + + # We need set cooling level after resetting the cooling level vector + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(str(cur_state)) + except (ValueError, IOError) as e: + raise RuntimeError("Failed to set cooling level - {}".format(e)) + + @classmethod + def get_cooling_level(cls): + try: + with open(COOLING_STATE_PATH, 'r') as cooling_state: + cooling_level = int(cooling_state.read().strip()) + return cooling_level + except (ValueError, IOError) as e: + raise RuntimeError("Failed to get cooling level - {}".format(e)) + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 1dfcf54baf17..3f2fee433faf 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -101,7 +101,7 @@ def __init__(self, psu_index, sku): # unplugable PSU has no FAN if sku not in hwsku_dict_with_unplugable_psu: - fan = Fan(sku, psu_index, psu_index, True) + fan = Fan(False, psu_index, psu_index, True) self._fan_list.append(fan) self.psu_green_led_path = "led_psu_green" @@ -121,7 +121,7 @@ def _read_generic_file(self, filename, len): result = 0 try: with open(filename, 'r') as fileobj: - result = int(fileobj.read()) + result = int(fileobj.read().strip()) except Exception as e: logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index a5faa5ea793a..39f91913fe1e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -42,6 +42,16 @@ HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" +THERMAL_ZONE_ASIC_PATH = "/var/run/hw-management/thermal/mlxsw/" +THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/" +THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/" +THERMAL_ZONE_MODE = "thermal_zone_mode" +THERMAL_ZONE_POLICY = "thermal_zone_policy" +THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" +THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" + +MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault" + thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max", @@ -262,6 +272,7 @@ def initialize_thermals(sku, thermal_list, psu_list): # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index] + Thermal.thermal_profile = thermal_profile for category in thermal_device_categories_all: if category == THERMAL_DEV_CATEGORY_AMBIENT: count, ambient_list = thermal_profile[category] @@ -290,6 +301,9 @@ def initialize_thermals(sku, thermal_list, psu_list): class Thermal(ThermalBase): + thermal_profile = None + thermal_algorithm_status = False + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories @@ -321,14 +335,15 @@ def get_name(self): return self.name - def _read_generic_file(self, filename, len): + @classmethod + def _read_generic_file(cls, filename, len): """ Read a generic file, returns the contents of the file """ result = None try: with open(filename, 'r') as fileobj: - result = fileobj.read() + result = fileobj.read().strip() except Exception as e: logger.log_info("Fail to read file {} due to {}".format(filename, repr(e))) return result @@ -420,3 +435,132 @@ def get_high_critical_threshold(self): if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: return None return value_float / 1000.0 + + + @classmethod + def _write_generic_file(cls, filename, content): + """ + Generic functions to write content to a specified file path if + the content has changed. + """ + try: + with open(filename, 'w+') as file_obj: + origin_content = file_obj.read() + if origin_content != content: + file_obj.write(content) + except Exception as e: + logger.log_info("Fail to write file {} due to {}".format(filename, repr(e))) + + @classmethod + def set_thermal_algorithm_status(cls, status, force=True): + """ + Enable/disable kernel thermal algorithm. + When enable kernel thermal algorithm, kernel will adjust fan speed + according to thermal zones temperature. Please note that kernel will + only adjust fan speed when temperature across some "edge", e.g temperature + changes to exceed high threshold. + When disable kernel thermal algorithm, kernel no longer adjust fan speed. + We usually disable the algorithm when we want to set a fix speed. E.g, when + a fan unit is removed from system, we will set fan speed to 100% and disable + the algorithm to avoid it adjust the speed. + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not force and cls.thermal_algorithm_status == status: + return + + cls.thermal_algorithm_status = status + content = "enabled" if status else "disabled" + policy = "step_wise" if status else "user_space" + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) + + @classmethod + def check_thermal_zone_temperature(cls): + """ + Check thermal zone current temperature with normal temperature + + Returns: + True if all thermal zones current temperature less or equal than normal temperature + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_ASIC_PATH): + return False + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_MODULE_PATH.format(start + index)): + return False + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_GEARBOX_PATH.format(start + index)): + return False + + return True + + @classmethod + def _check_thermal_zone_temperature(cls, thermal_zone_path): + normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) + current_temp_path = join(thermal_zone_path, THERMAL_ZONE_TEMPERATURE) + normal = None + current = None + try: + with open(normal_temp_path, 'r') as file_obj: + normal = float(file_obj.read()) + + with open(current_temp_path, 'r') as file_obj: + current = float(file_obj.read()) + + return current <= normal + except Exception as e: + logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + + @classmethod + def check_module_temperature_trustable(cls): + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + for index in range(count): + fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start) + fault = cls._read_generic_file(fault_file_path, 0) + if fault.strip() != '0': + return 'untrust' + return 'trust' + + @classmethod + def get_air_flow_direction(cls): + fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT) + port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) + + # if there is any exception, let it raise + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) + if fan_ambient_temp > port_ambient_temp: + return 'p2c', fan_ambient_temp + elif fan_ambient_temp < port_ambient_temp: + return 'c2p', port_ambient_temp + else: + return 'unk', fan_ambient_temp diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 72729287d1c5..1f8292763ddd 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -1,5 +1,6 @@ from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object +from .thermal import logger class SetFanSpeedAction(ThermalPolicyActionBase): @@ -52,7 +53,38 @@ def execute(self, thermal_info_dict): fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] for fan in fan_info_obj.get_presence_fans(): fan.set_speed(self.speed) + logger.log_info('Set all system FAN speed to {}'.format(self.speed)) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed) + + @classmethod + def set_psu_fan_speed(cls, thermal_info_dict, speed): + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(speed) + + logger.log_info('Updated PSU FAN speed to {}%'.format(speed)) + + + +@thermal_json_object('fan.all.check_and_set_speed') +class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): + """ + Action to check thermal zone temperature and recover speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Check thermal zone and set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal import Thermal + if Thermal.check_thermal_zone_temperature(): + SetAllFanSpeedAction.execute(self, thermal_info_dict) + @thermal_json_object('thermal_control.control') class ControlThermalAlgoAction(ThermalPolicyActionBase): @@ -95,14 +127,80 @@ def execute(self, thermal_info_dict): :param thermal_info_dict: A dictionary stores all thermal information. :return: """ + from .thermal_infos import FanInfo + from .thermal import Thermal + from .thermal_conditions import UpdateCoolingLevelToMinCondition + from .fan import Fan + Thermal.set_thermal_algorithm_status(self.status, False) + if self.status: + # Check thermal zone temperature, if all thermal zone temperature + # back to normal, set it to minimum allowed speed to + # save power + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + + +class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): + UNKNOWN_SKU_COOLING_LEVEL = 6 + def execute(self, thermal_info_dict): + from .device_data import DEVICE_DATA + from .fan import Fan from .thermal_infos import ChassisInfo - if ChassisInfo.INFO_NAME in thermal_info_dict: - chassis_info_obj = thermal_info_dict[ChassisInfo.INFO_NAME] - chassis = chassis_info_obj.get_chassis() - thermal_manager = chassis.get_thermal_manager() - if self.status: - thermal_manager.start_thermal_control_algorithm() - else: - thermal_manager.stop_thermal_control_algorithm() + from .thermal_conditions import MinCoolingLevelChangeCondition + from .thermal_conditions import UpdateCoolingLevelToMinCondition + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + if chassis.platform_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.platform_name] or 'minimum_table' not in DEVICE_DATA[chassis.platform_name]['thermal']: + Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL + else: + air_flow_dir = MinCoolingLevelChangeCondition.air_flow_dir + trust_state = MinCoolingLevelChangeCondition.trust_state + temperature = MinCoolingLevelChangeCondition.temperature + minimum_table = DEVICE_DATA[chassis.platform_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] + + for key, cooling_level in minimum_table.items(): + temp_range = key.split(':') + temp_min = int(temp_range[0].strip()) + temp_max = int(temp_range[1].strip()) + if temp_min <= temperature <= temp_max: + Fan.min_cooling_level = cooling_level - 10 + break + + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level < Fan.min_cooling_level: + Fan.set_cooling_level(Fan.min_cooling_level, Fan.min_cooling_level) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) + else: + Fan.set_cooling_level(Fan.min_cooling_level, current_cooling_level) + UpdateCoolingLevelToMinAction.update_cooling_level_to_minimum(thermal_info_dict) + + logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) + + +class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + from .thermal_conditions import CoolingLevelChangeCondition + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10) + + +class UpdateCoolingLevelToMinAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + self.update_cooling_level_to_minimum(thermal_info_dict) + + @classmethod + def update_cooling_level_to_minimum(cls, thermal_info_dict): + from .fan import Fan + from .thermal import Thermal + from .thermal_conditions import UpdateCoolingLevelToMinCondition + from .thermal_infos import FanInfo + if Thermal.check_thermal_zone_temperature(): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + speed = Fan.min_cooling_level * 10 + for fan in fan_info_obj.get_presence_fans(): + fan.set_speed(speed) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, speed) + UpdateCoolingLevelToMinCondition.enable = False + else: + UpdateCoolingLevelToMinCondition.enable = True diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 2df59acc9bf1..6bd2d282862b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -32,6 +32,20 @@ def is_match(self, thermal_info_dict): return len(fan_info_obj.get_absence_fans()) == 0 if fan_info_obj else False +@thermal_json_object('fan.any.fault') +class AnyFanFaultCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.good') +class AllFanGoodCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) == 0 if fan_info_obj else False + + class PsuCondition(ThermalPolicyConditionBase): def get_psu_info(self, thermal_info_dict): from .thermal_infos import PsuInfo @@ -61,3 +75,57 @@ def is_match(self, thermal_info_dict): psu_info_obj = self.get_psu_info(thermal_info_dict) return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False + +class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): + trust_state = None + air_flow_dir = None + temperature = None + + def is_match(self, thermal_info_dict): + from .thermal import Thermal + + trust_state = Thermal.check_module_temperature_trustable() + air_flow_dir, temperature = Thermal.get_air_flow_direction() + temperature = temperature / 1000 + + change_cooling_level = False + if trust_state != MinCoolingLevelChangeCondition.trust_state: + MinCoolingLevelChangeCondition.trust_state = trust_state + change_cooling_level = True + + if air_flow_dir != MinCoolingLevelChangeCondition.air_flow_dir: + MinCoolingLevelChangeCondition.air_flow_dir = air_flow_dir + change_cooling_level = True + + if temperature != MinCoolingLevelChangeCondition.temperature: + MinCoolingLevelChangeCondition.temperature = temperature + change_cooling_level = True + + return change_cooling_level + + +class CoolingLevelChangeCondition(ThermalPolicyConditionBase): + cooling_level = None + + def is_match(self, thermal_info_dict): + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level != CoolingLevelChangeCondition.cooling_level: + CoolingLevelChangeCondition.cooling_level = current_cooling_level + return True + else: + return False + + +class UpdateCoolingLevelToMinCondition(ThermalPolicyConditionBase): + enable = False + def is_match(self, thermal_info_dict): + if not UpdateCoolingLevelToMinCondition.enable: + return False + + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level == Fan.min_cooling_level: + UpdateCoolingLevelToMinCondition.enable = False + return False + return True diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 82c186495f5e..e810a5646456 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -14,6 +14,7 @@ class FanInfo(ThermalPolicyInfoBase): def __init__(self): self._absence_fans = set() self._presence_fans = set() + self._fault_fans = set() self._status_changed = False def collect(self, chassis): @@ -24,17 +25,27 @@ def collect(self, chassis): """ self._status_changed = False for fan in chassis.get_all_fans(): - if fan.get_presence() and fan not in self._presence_fans: + presence = fan.get_presence() + status = fan.get_status() + if presence and fan not in self._presence_fans: self._presence_fans.add(fan) self._status_changed = True if fan in self._absence_fans: self._absence_fans.remove(fan) - elif not fan.get_presence() and fan not in self._absence_fans: + elif not presence and fan not in self._absence_fans: self._absence_fans.add(fan) self._status_changed = True if fan in self._presence_fans: self._presence_fans.remove(fan) + if not status and fan not in self._fault_fans: + self._fault_fans.add(fan) + self._status_changed = True + elif status and fan in self._fault_fans: + self._fault_fans.remove(fan) + self._status_changed = True + + def get_absence_fans(self): """ Retrieves absence fans @@ -49,6 +60,13 @@ def get_presence_fans(self): """ return self._presence_fans + def get_fault_fans(self): + """ + Retrieves fault fans + :return: A set of fault fans + """ + return self._fault_fans + def is_status_changed(self): """ Retrieves if the status of fan information changed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 133bb078ca20..914eec79816c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,12 +1,29 @@ import os from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase +from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from .thermal_actions import * from .thermal_conditions import * from .thermal_infos import * class ThermalManager(ThermalManagerBase): - THERMAL_ALGORITHM_CONTROL_PATH = '/var/run/hw-management/config/suspend' + @classmethod + def initialize(cls): + """ + Initialize thermal manager, including register thermal condition types and thermal action types + and any other vendor specific initialization. + :return: + """ + cls._add_private_thermal_policy() + + @classmethod + def deinitialize(cls): + """ + Destroy thermal manager, including any vendor specific cleanup. The default behavior of this function + is a no-op. + :return: + """ + cls.start_thermal_control_algorithm() @classmethod def start_thermal_control_algorithm(cls): @@ -16,7 +33,8 @@ def start_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(False) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(True) @classmethod def stop_thermal_control_algorithm(cls): @@ -26,25 +44,22 @@ def stop_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(True) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(False) @classmethod - def _control_thermal_control_algorithm(cls, suspend): - """ - Control thermal control algorithm - - Args: - suspend: Bool, indicate suspend the algorithm or not + def _add_private_thermal_policy(cls): + dynamic_min_speed_policy = ThermalPolicy() + dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() + dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() + cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy - Returns: - bool: True if set success, False if fail. - """ - status = True - write_value = 1 if suspend else 0 - try: - with open(cls.THERMAL_ALGORITHM_CONTROL_PATH, 'w') as control_file: - control_file.write(str(write_value)) - except (ValueError, IOError): - status = False + update_psu_fan_speed_policy = ThermalPolicy() + update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition() + update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction() + cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy - return status + update_cooling_level_policy = ThermalPolicy() + update_cooling_level_policy.conditions[UpdateCoolingLevelToMinCondition] = UpdateCoolingLevelToMinCondition() + update_cooling_level_policy.actions[UpdateCoolingLevelToMinAction] = UpdateCoolingLevelToMinAction() + cls._policy_dict['UpdateCoolingLevelPolicy'] = update_cooling_level_policy diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index f34ace97968d..c53480584889 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -1,13 +1,20 @@ class MockFan: + speed = 60 def __init__(self): self.presence = True - self.speed = 60 + self.status = True def get_presence(self): return self.presence def set_speed(self, speed): - self.speed = speed + MockFan.speed = speed + + def get_status(self): + return self.status + + def get_target_speed(self): + return MockFan.speed class MockPsu: @@ -21,6 +28,9 @@ def get_presence(self): def get_powergood_status(self): return self.powergood + def get_all_fans(self): + return [] + class MockChassis: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 843244e937fa..835d7a495bbb 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -11,6 +11,11 @@ from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_infos import FanInfo, PsuInfo +from sonic_platform.fan import Fan +from sonic_platform.thermal import Thermal + +Thermal.check_thermal_zone_temperature = MagicMock() +Thermal.set_thermal_algorithm_status = MagicMock() @pytest.fixture(scope='session', autouse=True) @@ -27,6 +32,7 @@ def test_load_policy(thermal_manager): assert 'any fan absence' in thermal_manager._policy_dict assert 'any psu absence' in thermal_manager._policy_dict + assert 'any fan broken' in thermal_manager._policy_dict assert 'all fan and psu presence' in thermal_manager._policy_dict assert thermal_manager._fan_speed_when_suspend == 60 @@ -40,6 +46,7 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 1 assert len(fan_info.get_presence_fans()) == 0 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() fan_list = chassis.get_all_fans() @@ -47,8 +54,15 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 0 assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() + fan_list[0].status = False + fan_info.collect(chassis) + assert len(fan_info.get_absence_fans()) == 0 + assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 1 + assert fan_info.is_status_changed() def test_psu_info(): chassis = MockChassis() @@ -77,35 +91,47 @@ def test_fan_policy(thermal_manager): chassis = MockChassis() chassis.make_fan_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[1].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) fan_list[0].presence = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + thermal_manager.run_policy(chassis) + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + assert Thermal.check_thermal_zone_temperature.call_count == 2 + assert fan_list[0].speed == 60 + assert fan_list[1].speed == 60 + + fan_list[0].status = False + thermal_manager.run_policy(chassis) + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) + + fan_list[0].status = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) + assert Thermal.check_thermal_zone_temperature.call_count == 2 + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 def test_psu_policy(thermal_manager): chassis = MockChassis() chassis.make_psu_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[0].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) psu_list = chassis.get_all_psus() psu_list[0].presence = True thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) def test_any_fan_absence_condition(): @@ -159,6 +185,44 @@ def test_all_fan_presence_condition(): fan_info.collect(chassis) assert condition.is_match({'fan_info': fan_info}) +def test_any_fan_fault_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AnyFanFaultCondition + condition = AnyFanFaultCondition() + assert condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert not condition.is_match({'fan_info': fan_info}) + +def test_all_fan_good_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllFanGoodCondition + condition = AllFanGoodCondition() + assert not condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert condition.is_match({'fan_info': fan_info}) + def test_any_psu_absence_condition(): chassis = MockChassis() @@ -275,6 +339,53 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_check_and_set_speed_action(): + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + json_str = '{\"speed\": \"40\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert action.speed == 40 + + json_str = '{\"speed\": \"-1\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"speed\": \"101\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"invalid\": \"60\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + +def test_execute_check_and_set_fan_speed_action(): + chassis = MockChassis() + fan_list = chassis.get_all_fans() + fan_list.append(MockFan()) + fan_list.append(MockFan()) + fan_info = FanInfo() + fan_info.collect(chassis) + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + action.speed = 99 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 99 + assert fan_list[1].speed == 99 + + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) + fan_list[0].speed = 100 + fan_list[1].speed = 100 + action.speed = 60 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 + def test_load_duplicate_condition(): from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy with open(os.path.join(test_path, 'duplicate_condition.json')) as f: @@ -315,4 +426,94 @@ class MockThermalManager(ThermalManagerBase): with pytest.raises(Exception): MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) +def test_dynamic_minimum_table_data(): + from sonic_platform.device_data import DEVICE_DATA + for platform, platform_data in DEVICE_DATA.items(): + if 'thermal' in platform_data and 'minimum_table' in platform_data['thermal']: + minimum_table = platform_data['thermal']['minimum_table'] + check_minimum_table_data(platform, minimum_table) + +def check_minimum_table_data(platform, minimum_table): + valid_dir = ['p2c', 'c2p', 'unk'] + valid_trust_state = ['trust', 'untrust'] + + for category, data in minimum_table.items(): + key_data = category.split('_') + assert key_data[0] in valid_dir + assert key_data[1] in valid_trust_state + + data_list = [(value, key) for key, value in data.items()] + data_list.sort(key=lambda x : x[0]) + + previous_edge = None + previous_cooling_level = None + for item in data_list: + cooling_level = item[0] + range_str = item[1] + + ranges = range_str.split(':') + low = int(ranges[0]) + high = int(ranges[1]) + assert low < high + + if previous_edge is None: + assert low == -127 + else: + assert low - previous_edge == 1, '{}-{}-{} error, item={}'.format(platform, key_data[0], key_data[1], item) + previous_edge = high + + assert 10 <= cooling_level <= 20 + if previous_cooling_level is not None: + assert cooling_level > previous_cooling_level + previous_cooling_level = cooling_level + +def test_dynamic_minimum_policy(thermal_manager): + from sonic_platform.thermal_conditions import MinCoolingLevelChangeCondition + from sonic_platform.thermal_actions import ChangeMinCoolingLevelAction + from sonic_platform.thermal_infos import ChassisInfo + from sonic_platform.thermal import Thermal + from sonic_platform.fan import Fan + ThermalManager.initialize() + assert 'DynamicMinCoolingLevelPolicy' in thermal_manager._policy_dict + policy = thermal_manager._policy_dict['DynamicMinCoolingLevelPolicy'] + assert MinCoolingLevelChangeCondition in policy.conditions + assert ChangeMinCoolingLevelAction in policy.actions + + condition = policy.conditions[MinCoolingLevelChangeCondition] + action = policy.actions[ChangeMinCoolingLevelAction] + Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') + Thermal.get_air_flow_direction = MagicMock(return_value=('p2c', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'trust' + assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' + assert MinCoolingLevelChangeCondition.temperature == 35 + assert not condition.is_match(None) + + Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'untrust' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.air_flow_dir == 'c2p' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 25000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.temperature == 25 + chassis = MockChassis() + chassis.platform_name = 'invalid' + info = ChassisInfo() + info._chassis = chassis + thermal_info_dict = {ChassisInfo.INFO_NAME: info} + Fan.get_cooling_level = MagicMock(return_value=5) + Fan.set_cooling_level = MagicMock() + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 6 + Fan.set_cooling_level.assert_called_with(6, 6) + Fan.set_cooling_level.call_count = 0 + + chassis.platform_name = 'x86_64-mlnx_msn2700-r0' + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 4 + Fan.set_cooling_level.assert_called_with(4, 5) diff --git a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json index 5d31b2abd875..413211b21220 100644 --- a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,19 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { "type": "thermal_control.control", "status": "true" + }, + { + "type": "fan.all.check_and_set_speed", + "speed": "60" } ] }