Skip to content

Commit

Permalink
[Mellanox] thermal control enhancement for dynamic minimum fan speed …
Browse files Browse the repository at this point in the history
…and PSU fan speed policy (#4403)
  • Loading branch information
Junchao-Mellanox authored Apr 21, 2020
1 parent 860cb26 commit c730f3e
Show file tree
Hide file tree
Showing 14 changed files with 849 additions and 65 deletions.
27 changes: 24 additions & 3 deletions device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"thermal_control_algorithm": {
"run_at_boot_up": "false",
"run_at_boot_up": "true",
"fan_speed_when_suspend": "60"
},
"info_types": [
Expand Down Expand Up @@ -51,6 +51,24 @@
}
]
},
{
"name": "any fan broken",
"conditions": [
{
"type": "fan.any.fault"
}
],
"actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
},
{
"name": "all fan and psu presence",
"conditions": [
Expand All @@ -59,12 +77,15 @@
},
{
"type": "psu.all.presence"
},
{
"type": "fan.all.good"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "60"
"type": "thermal_control.control",
"status": "true"
}
]
}
Expand Down
3 changes: 2 additions & 1 deletion dockers/docker-platform-monitor/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ RUN apt-get update && \
rrdtool \
python-smbus \
ethtool \
dmidecode && \
dmidecode \
i2c-tools && \
pip install enum34

{% if docker_platform_monitor_debs.strip() -%}
Expand Down
12 changes: 10 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
MLNX_NUM_PSU = 2

GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku"
GET_PLATFORM_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.platform"

EEPROM_CACHE_ROOT = '/var/cache/sonic/decode-syseeprom'
EEPROM_CACHE_FILE = 'syseeprom_cache'
Expand Down Expand Up @@ -60,6 +61,7 @@ def __init__(self):

# Initialize SKU name
self.sku_name = self._get_sku_name()
self.platform_name = self._get_platform_name()
mi = get_machine_info()
if mi is not None:
self.name = mi['onie_platform']
Expand Down Expand Up @@ -110,9 +112,9 @@ def initialize_fan(self):

for index in range(num_of_fan):
if multi_rotor_in_drawer:
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
fan = Fan(has_fan_dir, index, index/2, False, self.platform_name)
else:
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
fan = Fan(has_fan_dir, index, index, False, self.platform_name)
self._fan_list.append(fan)


Expand Down Expand Up @@ -245,6 +247,12 @@ def _get_sku_name(self):
return out.rstrip('\n')


def _get_platform_name(self):
    """
    Retrieve the platform name by running GET_PLATFORM_CMD in a shell.

    Returns:
        The command's standard output with trailing newline characters
        removed.
    """
    # Only stdout is piped, so the second element returned by
    # communicate() carries no data and is ignored.
    proc = subprocess.Popen(GET_PLATFORM_CMD, shell=True, stdout=subprocess.PIPE)
    stdout_data = proc.communicate()[0]
    return stdout_data.rstrip('\n')


def _get_port_position_tuple_by_sku_name(self):
    """
    Look up the port position tuple that belongs to this chassis' SKU.

    The SKU name is translated through hwsku_dict_port, and the resulting
    value selects the entry of port_position_tuple_list that is returned.
    """
    sku_key = hwsku_dict_port[self.sku_name]
    return port_position_tuple_list[sku_key]
Expand Down
101 changes: 101 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Static per-platform data, keyed by platform name string
# (for example 'x86_64-mlnx_msn2700-r0').
#
# Every populated entry has a 'thermal' section holding a 'minimum_table'.
# The table keys are named p2c/c2p/unk combined with _trust/_untrust, and each
# one maps "low:high" range strings to an integer value.
#
# NOTE(review): the ranges look like temperature intervals and the integers
# like minimum cooling values for the dynamic minimum fan speed policy; the
# meaning of the p2c/c2p/unk and trust/untrust prefixes is not visible here -
# confirm against the thermal code that consumes this table.
DEVICE_DATA = {
'x86_64-mlnx_msn2700-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:40":13, "41:120":15},
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16},
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}
}
}
},
'x86_64-mlnx_msn2740-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:120":13},
"p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15},
"c2p_trust": {"-127:120":13},
"c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17},
"unk_trust": {"-127:120":13},
"unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17},
}
}
},
'x86_64-mlnx_msn2100-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:120":12},
"p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16},
"c2p_trust": {"-127:40":12, "41:120":13},
"c2p_untrust": {"-127:40":12, "41:120":13},
"unk_trust": {"-127:40":12, "41:120":13},
"unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}
}
}
},
'x86_64-mlnx_msn2410-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:40":13, "41:120":15},
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16},
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16},
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}
}
}
},
'x86_64-mlnx_msn2010-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:120":12},
"p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16},
"c2p_trust": {"-127:120":12},
"c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16},
"unk_trust": {"-127:120":12},
"unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16}
}
}
},
'x86_64-mlnx_msn3700-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
"c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
}
}
},
'x86_64-mlnx_msn3700c-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
"c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_trust": {"-127:25":12, "26:40":13 , "41:120":14},
"unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16},
}
}
},
'x86_64-mlnx_msn3800-r0': {
'thermal': {
'minimum_table': {
"p2c_trust": {"-127:35":12, "36:120":13},
"p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17},
"c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14},
"c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16},
"unk_trust": {"-127:30":12, "31:40":13 , "41:120":14},
"unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17},
}
}
},
# This platform is listed but has no data defined; the entry is empty.
'x86_64-mlnx_msn4700-r0': {

}
}
104 changes: 89 additions & 15 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#############################################################################

import os.path
import subprocess

try:
from sonic_platform_base.fan_base import FanBase
Expand All @@ -22,25 +23,34 @@

FAN_PATH = "/var/run/hw-management/thermal/"
LED_PATH = "/var/run/hw-management/led/"
CONFIG_PATH = "/var/run/hw-management/config"
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir"
COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state"

# SKUs with unplugable FANs:
# Platforms with non-removable (fixed) FANs:
# 1. They don't have fanX_status and should be treated as always present
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']
platform_with_unplugable_fan = ['x86_64-mlnx_msn2010-r0', 'x86_64-mlnx_msn2100-r0']


class Fan(FanBase):
"""Platform-specific Fan class"""

STATUS_LED_COLOR_ORANGE = "orange"

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
min_cooling_level = 2
MIN_VALID_COOLING_LEVEL = 1
MAX_VALID_COOLING_LEVEL = 10
# PSU fan speed vector
PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c',
'0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64']

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, platform = None):
# API index is starting from 0, Mellanox platform index is starting from 1
self.index = fan_index + 1
self.drawer_index = drawer_index + 1

self.is_psu_fan = psu_fan
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True
self.always_presence = False if platform not in platform_with_unplugable_fan else True

self.fan_min_speed_path = "fan{}_min".format(self.index)
if not self.is_psu_fan:
Expand All @@ -54,6 +64,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index))
self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index))
self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command')

self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index)
Expand Down Expand Up @@ -90,7 +104,7 @@ def get_direction(self):

try:
with open(os.path.join(self.fan_dir), 'r') as fan_dir:
fan_dir_bits = int(fan_dir.read())
fan_dir_bits = int(fan_dir.read().strip())
fan_mask = 1 << self.drawer_index - 1
if fan_dir_bits & fan_mask:
return self.FAN_DIRECTION_INTAKE
Expand All @@ -116,7 +130,7 @@ def get_status(self):
else:
try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read())
status = int(fault_status.read().strip())
except (ValueError, IOError):
status = 1

Expand All @@ -142,7 +156,7 @@ def get_presence(self):
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
status = int(presence_status.read().strip())
except (ValueError, IOError):
status = 0

Expand All @@ -164,7 +178,7 @@ def _get_max_speed_in_rpm(self):
speed = 0
try:
with open(os.path.join(FAN_PATH, self.fan_max_speed_path), 'r') as max_fan_speed:
speed = int(max_fan_speed.read())
speed = int(max_fan_speed.read().strip())
except (ValueError, IOError):
speed = 0

Expand All @@ -181,7 +195,7 @@ def get_speed(self):
speed = 0
try:
with open(os.path.join(FAN_PATH, self.fan_speed_get_path), 'r') as fan_curr_speed:
speed_in_rpm = int(fan_curr_speed.read())
speed_in_rpm = int(fan_curr_speed.read().strip())
except (ValueError, IOError):
speed_in_rpm = 0

Expand Down Expand Up @@ -210,7 +224,7 @@ def get_target_speed(self):

try:
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'r') as fan_pwm:
pwm = int(fan_pwm.read())
pwm = int(fan_pwm.read().strip())
except (ValueError, IOError):
pwm = 0

Expand All @@ -231,13 +245,34 @@ def set_speed(self, speed):
bool: True if set success, False if fail.
"""
status = True
pwm = int(round(PWM_MAX*speed/100.0))

if self.is_psu_fan:
#PSU fan speed is not setable.
return False

from .thermal import logger
try:
with open(self.psu_i2c_bus_path, 'r') as f:
bus = f.read().strip()
with open(self.psu_i2c_addr_path, 'r') as f:
addr = f.read().strip()
with open(self.psu_i2c_command_path, 'r') as f:
command = f.read().strip()
speed = Fan.PSU_FAN_SPEED[int(speed / 10)]
command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed)
subprocess.check_call(command, shell = True)
return True
except subprocess.CalledProcessError as ce:
logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output))
return False
except Exception as e:
logger.log_error('Failed to set PSU FAN speed - {}'.format(e))
return False

try:
cooling_level = int(speed / 10)
if cooling_level < self.min_cooling_level:
cooling_level = self.min_cooling_level
speed = self.min_cooling_level * 10
self.set_cooling_level(cooling_level, cooling_level)
pwm = int(round(PWM_MAX*speed/100.0))
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm:
fan_pwm.write(str(pwm))
except (ValueError, IOError):
Expand Down Expand Up @@ -352,3 +387,42 @@ def get_speed_tolerance(self):
"""
# The tolerance value is fixed as 20% for all the Mellanox platform
return 20

@classmethod
def set_cooling_level(cls, level, cur_state):
    """
    Set the FAN cooling level by writing to the cooling state file.

    Args:
        level: Integer cooling level in the range
            [MIN_VALID_COOLING_LEVEL, MAX_VALID_COOLING_LEVEL].
            1 means 10%, 2 means 20%, 10 means 100%.
        cur_state: Value written to the cooling state file after the
            cooling level vector has been reset.

    Raises:
        RuntimeError: If level is not an integer, is out of the valid
            range, or the cooling state file cannot be written.
    """
    if not isinstance(level, int):
        raise RuntimeError("Failed to set cooling level, input parameter must be integer")

    lowest, highest = cls.MIN_VALID_COOLING_LEVEL, cls.MAX_VALID_COOLING_LEVEL
    if not lowest <= level <= highest:
        raise RuntimeError("Failed to set cooling level, level value must be in range [{}, {}], got {}".format(
            lowest,
            highest,
            level
        ))

    # According to the low level team, setting the cooling level to X
    # requires first writing (10 + X) to reset the cooling level vector;
    # the requested state is written afterwards. Each value is written
    # through its own open/close of the file, in that order.
    try:
        for value in (level + 10, cur_state):
            with open(COOLING_STATE_PATH, 'w') as state_file:
                state_file.write(str(value))
    except (ValueError, IOError) as e:
        raise RuntimeError("Failed to set cooling level - {}".format(e))

@classmethod
def get_cooling_level(cls):
    """
    Read the current cooling level from the cooling state file.

    Returns:
        The file content, stripped of surrounding whitespace, as an integer.

    Raises:
        RuntimeError: If the file cannot be read or its content is not a
            valid integer.
    """
    try:
        with open(COOLING_STATE_PATH, 'r') as state_file:
            raw_value = state_file.read()
            return int(raw_value.strip())
    except (ValueError, IOError) as e:
        raise RuntimeError("Failed to get cooling level - {}".format(e))

Loading

0 comments on commit c730f3e

Please sign in to comment.