Skip to content

Commit

Permalink
[Mellanox] Support PSU power threshold checking (#11863)
Browse files Browse the repository at this point in the history
* Support power threshold

Signed-off-by: Stephen Sun <stephens@nvidia.com>

* get_psu_power_warning_threshold => get_psu_power_warning_suppress_threshold

Signed-off-by: Stephen Sun <stephens@nvidia.com>

* Fix comments

Signed-off-by: Stephen Sun <stephens@nvidia.com>

Signed-off-by: Stephen Sun <stephens@nvidia.com>
  • Loading branch information
stephenxs authored Nov 21, 2022
1 parent f402e6b commit 5d45759
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 0 deletions.
57 changes: 57 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,11 @@ class Psu(FixedPsu):
PSU_VPD = "eeprom/psu{}_vpd"
PSU_CURRENT_IN = "power/psu{}_curr_in"
PSU_VOLT_IN = "power/psu{}_volt_in"
PORT_AMBIENT_TEMP = os.path.join(PSU_PATH, "thermal/port_amb")
FAN_AMBIENT_TEMP = os.path.join(PSU_PATH, "thermal/fan_amb")
AMBIENT_TEMP_CRITICAL_THRESHOLD = os.path.join(PSU_PATH, "config/amb_tmp_crit_limit")
AMBIENT_TEMP_WARNING_THRESHOLD = os.path.join(PSU_PATH, "config/amb_tmp_warn_limit")
PSU_POWER_SLOPE = os.path.join(PSU_PATH, "config/psu_power_slope")

shared_led = None

Expand All @@ -235,6 +240,8 @@ def __init__(self, psu_index):
self.psu_power_max = self.psu_power + "_max"
self.psu_presence = os.path.join(PSU_PATH, "thermal/psu{}_status".format(self.index))

self.psu_power_max_capacity = os.path.join(PSU_PATH, "config/psu{}_power_capacity".format(self.index))

self.psu_temp = os.path.join(PSU_PATH, 'thermal/psu{}_temp'.format(self.index))
self.psu_temp_threshold = os.path.join(PSU_PATH, 'thermal/psu{}_temp_max'.format(self.index))

Expand Down Expand Up @@ -505,6 +512,56 @@ def get_input_current(self):
return float(amperes) / 1000
return None

def _get_psu_power_threshold(self, temp_threshold_path):
    """
    Derive a power threshold for this PSU from its maximum power capacity
    and the current ambient temperature.

    The ambient temperature is the lower of the fan and port ambient
    readings. While it is below the temperature limit read from
    `temp_threshold_path`, the threshold equals the maximum capacity.
    Otherwise the capacity is reduced linearly:
        threshold = max capacity - slope * (amb_temp - temp_limit)
    A result that is zero or below is logged and clamped to 0.

    Args:
        temp_threshold_path: path of the file holding the ambient
            temperature limit to compare against.

    Returns:
        The raw threshold divided by 1000000 as a float (the public
        getters built on this helper document the value as watts), or
        None when the PSU power is not good or the capacity file is absent.
    """
    # Nothing can be computed for a PSU that is not delivering good power.
    if not self.get_powergood_status():
        return None
    # A missing capacity file means the threshold cannot be computed here.
    if not os.path.exists(self.psu_power_max_capacity):
        return None

    capacity = utils.read_int_from_file(self.psu_power_max_capacity)
    temp_limit = utils.read_int_from_file(temp_threshold_path)
    ambient = min(
        utils.read_int_from_file(Psu.FAN_AMBIENT_TEMP),
        utils.read_int_from_file(Psu.PORT_AMBIENT_TEMP),
    )

    threshold = capacity
    if not (ambient < temp_limit):
        # The slope is only read once derating actually applies.
        derate_slope = utils.read_int_from_file(Psu.PSU_POWER_SLOPE)
        threshold = capacity - (ambient - temp_limit) * derate_slope

    if threshold <= 0:
        logger.log_warning('Got negative PSU power threshold {} for {}'.format(threshold, self.get_name()))
        threshold = 0

    return float(threshold) / 1000000

def get_psu_power_warning_suppress_threshold(self):
    """
    Return the warning suppress threshold of the power on this PSU.

    On Mellanox platforms this value is translated from the
    `warning threshold`, i.e. it is computed against the ambient
    temperature warning limit. The result can change between calls, so
    callers should query it every time rather than caching it.

    Returns:
        A float number, the warning suppress threshold of the PSU in
        watts, or None when the underlying helper cannot compute it.
    """
    warning_limit_path = Psu.AMBIENT_TEMP_WARNING_THRESHOLD
    return self._get_psu_power_threshold(warning_limit_path)

def get_psu_power_critical_threshold(self):
    """
    Return the critical threshold of the power on this PSU.

    The value is computed against the ambient temperature critical limit.
    The result can change between calls, so callers should query it every
    time rather than caching it.

    Returns:
        A float number, the critical threshold of the PSU in watts, or
        None when the underlying helper cannot compute it.
    """
    critical_limit_path = Psu.AMBIENT_TEMP_CRITICAL_THRESHOLD
    return self._get_psu_power_threshold(critical_limit_path)


class InvalidPsuVolWA:
"""This class is created as a workaround for a known hardware issue that the PSU voltage threshold could be a
invalid value 127998. Once we read a voltage threshold value equal to 127998, we should do following:
Expand Down
55 changes: 55 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,58 @@ def get_entry_value(key):
vpd_info[InvalidPsuVolWA.CAPACITY_FIELD] = InvalidPsuVolWA.EXPECT_CAPACITY
assert InvalidPsuVolWA.run(psu, InvalidPsuVolWA.INVALID_VOLTAGE_VALUE, '') == 9999
mock_run_command.assert_called_with(['sensors', '-s'])

@mock.patch('os.path.exists', mock.MagicMock(return_value=True))
@mock.patch('sonic_platform.utils.read_int_from_file')
def test_psu_power_threshold(self, mock_read_int_from_file):
    """
    Check the warning-suppress and critical power thresholds against three
    sets of mocked ambient temperature readings.

    Every file read is served by the mocked `read_int_from_file`: values
    shared by all scenarios come from `common_info`, the per-scenario
    ambient readings come from the scenario dict currently under test.
    """
    Psu.all_psus_support_power_threshold = True
    psu = Psu(0)

    # Readings that stay the same for every scenario.
    common_info = {
        psu.psu_oper_status: 1,
        psu.psu_power_max_capacity: 100000000,
        psu.AMBIENT_TEMP_CRITICAL_THRESHOLD: 65000,
        psu.AMBIENT_TEMP_WARNING_THRESHOLD: 55000,
        psu.PSU_POWER_SLOPE: 2000,
    }

    # Each scenario carries its ambient readings plus the expected results.
    scenarios = (
        # Ambient below both limits: no derating at all.
        {
            psu.PORT_AMBIENT_TEMP: 55000,
            psu.FAN_AMBIENT_TEMP: 50000,
            'warning_threshold': 100.0,
            'critical_threshold': 100.0,
        },
        # Ambient above the warning limit only.
        {
            psu.PORT_AMBIENT_TEMP: 65000,
            psu.FAN_AMBIENT_TEMP: 60000,
            'warning_threshold': 90.0,
            'critical_threshold': 100.0,
        },
        # Ambient above both limits.
        {
            psu.PORT_AMBIENT_TEMP: 70000,
            psu.FAN_AMBIENT_TEMP: 75000,
            'warning_threshold': 70.0,
            'critical_threshold': 90.0,
        },
    )

    test_data = {}

    def mock_side_effect(value):
        # `test_data` is looked up at call time, so rebinding it in the
        # loop below switches the scenario the mock serves.
        return common_info[value] if value in common_info else test_data[value]

    mock_read_int_from_file.side_effect = mock_side_effect

    for test_data in scenarios:
        assert psu.get_psu_power_warning_suppress_threshold() == test_data['warning_threshold']
        assert psu.get_psu_power_critical_threshold() == test_data['critical_threshold']

def test_psu_not_support_power_threshold(self):
    """
    Both power threshold getters must return None when nothing is mocked.

    NOTE(review): presumably this holds because the power capacity file
    does not exist in the test environment -- confirm against the fixture.
    """
    psu = Psu(0)

    warning_threshold = psu.get_psu_power_warning_suppress_threshold()
    assert warning_threshold is None

    critical_threshold = psu.get_psu_power_critical_threshold()
    assert critical_threshold is None

0 comments on commit 5d45759

Please sign in to comment.