-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Mellanox] thermal control enhancement for dynamic minimum fan speed and PSU fan speed policy #4403
Changes from 3 commits
3b77d71
6ab058a
5a75eae
981bfb5
df85565
4398275
3a4f559
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
DEVICE_DATA = { | ||
'ACS-MSN2700': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:40":13, "41:120":15}, | ||
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} | ||
} | ||
} | ||
}, | ||
'LS-SN2700': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:40":13, "41:120":15}, | ||
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} | ||
} | ||
} | ||
}, | ||
'ACS-MSN2740': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:120":13}, | ||
"p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15}, | ||
"c2p_trust": {"-127:120":13}, | ||
"c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, | ||
"unk_trust": {"-127:120":13}, | ||
"unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, | ||
} | ||
} | ||
}, | ||
'ACS-MSN2410': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:40":13, "41:120":15}, | ||
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} | ||
} | ||
} | ||
}, | ||
'Mellanox-SN2700': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:40":13, "41:120":15}, | ||
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} | ||
} | ||
} | ||
}, | ||
'Mellanox-SN2700-D48C8': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:40":13, "41:120":15}, | ||
"p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, | ||
"unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} | ||
} | ||
} | ||
}, | ||
'ACS-MSN2100': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:120":12}, | ||
"p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:40":12, "41:120":13}, | ||
"c2p_untrust": {"-127:40":12, "41:120":13}, | ||
"unk_trust": {"-127:40":12, "41:120":13}, | ||
"unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16} | ||
} | ||
} | ||
}, | ||
'ACS-MSN2010': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:120":12}, | ||
"p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16}, | ||
"c2p_trust": {"-127:120":12}, | ||
"c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16}, | ||
"unk_trust": {"-127:120":12}, | ||
"unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} | ||
} | ||
} | ||
}, | ||
'ACS-MSN3700': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, | ||
"p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, | ||
"c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, | ||
"c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, | ||
"unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, | ||
"unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, | ||
} | ||
} | ||
}, | ||
'ACS-MSN3800': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:35":12, "36:120":13}, | ||
"p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, | ||
"c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, | ||
"c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, | ||
"unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, | ||
"unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, | ||
} | ||
} | ||
}, | ||
'Mellanox-SN3800-D112C8': { | ||
'thermal': { | ||
'minimum_table': { | ||
"p2c_trust": {"-127:35":12, "36:120":13}, | ||
"p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, | ||
"c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, | ||
"c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, | ||
"unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, | ||
"unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, | ||
} | ||
} | ||
}, | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are missing SN4700 which is also supported. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't have minimum table for 4700. We set minimum FAN speed to 60% for 4700 or any other platform who has no minimum table. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
############################################################################# | ||
|
||
import os.path | ||
import subprocess | ||
|
||
try: | ||
from sonic_platform_base.fan_base import FanBase | ||
|
@@ -22,17 +23,24 @@ | |
|
||
FAN_PATH = "/var/run/hw-management/thermal/" | ||
LED_PATH = "/var/run/hw-management/led/" | ||
CONFIG_PATH = "/var/run/hw-management/config" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think we have this config path on different files. is this correct to have it this way instead of having it inherited? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is correct. The config path here is a directory and I only need it to find some i2c address related to PSU fan. I don't see difference among differenct platforms. |
||
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches | ||
FAN_DIR = "/var/run/hw-management/system/fan_dir" | ||
COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" | ||
|
||
# SKUs with unplugable FANs: | ||
# 1. don't have fanX_status and should be treated as always present | ||
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here, should not be SKU but rather a system type |
||
|
||
|
||
class Fan(FanBase): | ||
"""Platform-specific Fan class""" | ||
|
||
STATUS_LED_COLOR_ORANGE = "orange" | ||
min_cooling_level = 2 | ||
# PSU fan speed vector | ||
PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c', | ||
'0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64'] | ||
|
||
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): | ||
# API index is starting from 0, Mellanox platform index is starting from 1 | ||
|
@@ -54,6 +62,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk | |
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) | ||
self._name = 'psu_{}_fan_{}'.format(self.index, 1) | ||
self.fan_max_speed_path = None | ||
self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) | ||
self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) | ||
self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command') | ||
|
||
self.fan_status_path = "fan{}_fault".format(self.index) | ||
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) | ||
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) | ||
|
@@ -231,13 +243,34 @@ def set_speed(self, speed): | |
bool: True if set success, False if fail. | ||
""" | ||
status = True | ||
pwm = int(round(PWM_MAX*speed/100.0)) | ||
|
||
if self.is_psu_fan: | ||
#PSU fan speed is not setable. | ||
return False | ||
|
||
from .thermal import logger | ||
try: | ||
with open(self.psu_i2c_bus_path, 'r') as f: | ||
bus = f.read().strip() | ||
with open(self.psu_i2c_addr_path, 'r') as f: | ||
addr = f.read().strip() | ||
with open(self.psu_i2c_command_path, 'r') as f: | ||
command = f.read().strip() | ||
speed = Fan.PSU_FAN_SPEED[int(speed / 10)] | ||
command = "i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed) | ||
subprocess.check_call(command, shell = True) | ||
return True | ||
except subprocess.CalledProcessError as ce: | ||
logger.log_error('Failed to call command {}, return code={}, command output={}'.format(ce.cmd, ce.returncode, ce.output)) | ||
return False | ||
except Exception as e: | ||
logger.log_error('Failed to set PSU FAN speed - {}'.format(e)) | ||
return False | ||
|
||
try: | ||
cooling_level = int(speed / 10) | ||
if cooling_level < self.min_cooling_level: | ||
cooling_level = self.min_cooling_level | ||
speed = self.min_cooling_level * 10 | ||
self.set_cooling_level(cooling_level) | ||
pwm = int(round(PWM_MAX*speed/100.0)) | ||
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: | ||
fan_pwm.write(str(pwm)) | ||
except (ValueError, IOError): | ||
|
@@ -352,3 +385,36 @@ def get_speed_tolerance(self): | |
""" | ||
# The tolerance value is fixed as 20% for all the Mellanox platform | ||
return 20 | ||
|
||
@classmethod | ||
def set_cooling_level(cls, level): | ||
""" | ||
Change cooling level. The input level should be an integer value [1, 10]. | ||
1 means 10%, 2 means 20%, 10 means 100%. | ||
""" | ||
if not isinstance(level, int): | ||
raise RuntimeError("Failed to set cooling level, input parameter must be integer") | ||
|
||
if level < 1 or level > 10: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggest to have min, max defines and use it for both if case and prints. |
||
raise RuntimeError("Failed to set cooling level, level value must be in range [1, 10], got {}".format(level)) | ||
|
||
try: | ||
# reset FAN driver and change cooling state | ||
with open(COOLING_STATE_PATH, 'w') as cooling_state: | ||
cooling_state.write(str(level + 10)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you please clarify? how do you reset with just setting the cooling level? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comment to code to explain this. |
||
|
||
# make cooling state display correct value | ||
with open(COOLING_STATE_PATH, 'w') as cooling_state: | ||
cooling_state.write(str(level)) | ||
except (ValueError, IOError) as e: | ||
raise RuntimeError("Failed to set cooling level - {}".format(e)) | ||
|
||
@classmethod | ||
def get_cooling_level(cls): | ||
try: | ||
with open(COOLING_STATE_PATH, 'r') as cooling_state: | ||
cooling_level = int(cooling_state.read()) | ||
return cooling_level | ||
except (ValueError, IOError) as e: | ||
raise RuntimeError("Failed to get cooling level - {}".format(e)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems like for the different SKU which are different flavour of the same Platform type we need to define the same table over and over. can you please find a better way to do so?
for example: ACS-MSN2700, LS-SN2700, Mellanox-SN2700, Mellanox-SN2700-D48C8, etc.
soon when we will have a dynamic SKU creator we will not be able to support it thus it cannot be based on SKU.
we have the same limitation in sfputil and this is wrong.