From 06948bf13f24ad855a0e24745611a94403de9b8e Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Thu, 11 Jul 2019 14:14:03 +0300 Subject: [PATCH 1/3] [Mellanox]refractor the sfp event change notification logic for new platform api remove the standalong daemon which is in charge of polling sfp change event through sdk interface and move the polling stuff to the event in the chassis daemon. --- .../sonic_platform/chassis.py | 82 +++++++ .../mlnx-platform-api/sonic_platform/sfp.py | 1 + .../sonic_platform/sfp_event.py | 206 ++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index f9875a296d35..cc92f2a27937 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -19,6 +19,7 @@ from sonic_platform.watchdog import get_watchdog from sonic_daemon_base.daemon_base import Logger from eeprom import Eeprom + from sfp_event import sfp_event from os import listdir from os.path import isfile, join import io @@ -28,6 +29,8 @@ except ImportError as e: raise ImportError (str(e) + "- required module not found") +MAX_SELECT_DELAY = 3600 + MLNX_NUM_PSU = 2 GET_HWSKU_CMD = "sonic-cfggen -d -v DEVICE_METADATA.localhost.hwsku" @@ -121,6 +124,14 @@ def __init__(self): self._component_name_list.append(COMPONENT_CPLD1) self._component_name_list.append(COMPONENT_CPLD2) + # Initialize sfp-change-listening stuff + self._init_sfp_change_event() + + def _init_sfp_change_event(self): + self.sfp_event = sfp_event() + self.sfp_event.initialize() + self.MAX_SELECT_EVENT_RETURNED = self.PORT_END + def _extract_num_of_fans_and_fan_drawers(self): num_of_fan = 0 num_of_drawer = 0 @@ -318,3 +329,74 @@ def get_firmware_version(self, component_name): return self._get_firmware_version() return None + 
+ def _show_capabilities(self): + """ + This function is for debug purpose + Some features require a xSFP module to support some capabilities but it's unrealistic to + check those modules one by one. + So this function is introduce to show some capabilities of all xSFP modules mounted on the device. + """ + for s in self._sfp_list: + try: + print "index {} tx disable {} dom {} calibration {} temp {} volt {} power (tx {} rx {})".format(s.index, + s.dom_tx_disable_supported, + s.dom_supported, + s.calibration, + s.dom_temp_supported, + s.dom_volt_supported, + s.dom_rx_power_supported, + s.dom_tx_power_supported + ) + except: + print "fail to retrieve capabilities for module index {}".format(s.index) + + def get_change_event(self, timeout=0): + """ + Returns a nested dictionary containing all devices which have + experienced a change at chassis level + + Args: + timeout: Timeout in milliseconds (optional). If timeout == 0, + this method will block until a change is detected. + + Returns: + (bool, dict): + - True if call successful, False if not; + - A nested dictionary where key is a device type, + value is a dictionary with key:value pairs in the format of + {'device_id':'device_event'}, + where device_id is the device ID for this device and + device_event, + status='1' represents device inserted, + status='0' represents device removed. + Ex. {'fan':{'0':'0', '2':'1'}, 'sfp':{'11':'0'}} + indicates that fan 0 has been removed, fan 2 + has been inserted and sfp 11 has been removed. + """ + wait_for_ever = (timeout == 0) + port_dict = {} + if wait_for_ever: + timeout = MAX_SELECT_DELAY + while True: + status = self.sfp_event.check_sfp_status(port_dict, timeout) + if not port_dict == {}: + break + else: + status = self.sfp_event.check_sfp_status(port_dict, timeout) + + # if no event polled, just return empty set + if status: + # workaround. 
+ # check_sfp_status cannot return all the notifications in fd via a single call + # due to sdk reason (see comment in check_sfp_status for detail). + # we have to iterate in a loop to get all the notifications in the fd. + i = 0 + while i < self.MAX_SELECT_EVENT_RETURNED: + status = self.sfp_event.check_sfp_status(port_dict, 0) + if not status: + break + i = i + 1 + return True, {'sfp':port_dict} + else: + return True, {} diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py index 6df3e6437b9d..9ea9c21899f5 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py @@ -327,6 +327,7 @@ def _dom_capability_detect(self): self.dom_volt_supported = False self.dom_rx_power_supported = False self.dom_tx_power_supported = False + self.calibration = 0 self.dom_tx_disable_supported = (int(sfp_dom_capability_raw[1], 16) & 0x40 != 0) else: self.dom_supported = False diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py new file mode 100644 index 000000000000..d4aa3f93d3e4 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python +''' +listen to the SDK for the SFP change event and return to chassis. 
+''' + +from __future__ import print_function +import sys, errno +import os +import time +import select +from python_sdk_api.sx_api import * +from sonic_daemon_base.daemon_base import Logger + +SYSLOG_IDENTIFIER = "sfp-event" + +SDK_SFP_STATE_IN = 0x1 +SDK_SFP_STATE_OUT = 0x2 +STATUS_PLUGIN = '1' +STATUS_PLUGOUT = '0' +STATUS_UNKNOWN = '2' + +sfp_value_status_dict = { + SDK_SFP_STATE_IN: STATUS_PLUGIN, + SDK_SFP_STATE_OUT: STATUS_PLUGOUT, +} + +PMPE_PACKET_SIZE = 2000 + +logger = Logger(SYSLOG_IDENTIFIER) + +class sfp_event: + ''' Listen to plugin/plugout cable events ''' + + SX_OPEN_RETRIES = 20 + + def __init__(self): + self.swid = 0 + self.handle = None + + def initialize(self): + # open SDK API handle + # retry at most SX_OPEN_RETRIES times to wait + # until SDK is started during system startup + retry = 1 + while True: + rc, self.handle = sx_api_open(None) + if rc == SX_STATUS_SUCCESS: + break + + logger.log_info("failed to open SDK API handle... retrying {}".format(retry)) + + time.sleep(2 ** retry) + retry += 1 + + if retry > self.SX_OPEN_RETRIES: + raise RuntimeError("failed to open SDK API handle after {} retries".format(retry)) + + # Allocate SDK fd and user channel structures + self.rx_fd_p = new_sx_fd_t_p() + self.user_channel_p = new_sx_user_channel_t_p() + + rc = sx_api_host_ifc_open(self.handle, self.rx_fd_p) + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("sx_api_host_ifc_open exited with error, rc {}".format(rc)) + + self.user_channel_p.type = SX_USER_CHANNEL_TYPE_FD + self.user_channel_p.channel.fd = self.rx_fd_p + + rc = sx_api_host_ifc_trap_id_register_set(self.handle, + SX_ACCESS_CMD_REGISTER, + self.swid, + SX_TRAP_ID_PMPE, + self.user_channel_p) + if rc != SX_STATUS_SUCCESS: + raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c)) + + def deinitialize(self): + if self.handle is None: + return + + # unregister trap id + rc = sx_api_host_ifc_trap_id_register_set(self.handle, + SX_ACCESS_CMD_DEREGISTER, 
+ self.swid, + SX_TRAP_ID_PMPE, + self.user_channel_p) + if rc != SX_STATUS_SUCCESS: + logger.log_error("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(rc)) + + rc = sx_api_host_ifc_close(self.handle, self.rx_fd_p) + if rc != SX_STATUS_SUCCESS: + logger.log_error("sx_api_host_ifc_close exited with error, rc {}".format(rc)) + + rc = sx_api_close(self.handle) + if rc != SX_STATUS_SUCCESS: + logger.log_error("sx_api_close exited with error, rc {}".format(rc)) + + delete_sx_fd_t_p(self.rx_fd_p) + delete_sx_user_channel_t_p(self.user_channel_p) + + def check_sfp_status(self, port_change, timeout): + """ + the meaning of timeout is aligned with select.select, which has the following meaning: + 0: poll, returns without blocking + arbitrary positive value: doesn't return until at least one fd in the set is ready or + the given number of seconds has elapsed + Note: + check_sfp_status makes use of select to retrieve the notifications, which means + it should have the logic of reading out all the notifications in the selected fd without blocking. + However, it fails to do that due to some characteristics of the sdk APIs: + sx_lib_host_ifc_recv can only read one notification each time and will block when there is no notification in that fd. + sx_lib_host_ifc_recv_list can return all notifications in the fd via a single read operation but + is not supported by the PMPE register (I've tested it but it failed) + as a result the only way to satisfy the logic is to call sx_lib_host_ifc_recv in a loop until all notifications + have been read, and we have to find a way to check that. it seems the only way to check that is by using select. + in this sense, we return one notification each time check_sfp_status is called and let the caller, get_change_event, + call it repeatedly with timeout = 0 in a loop until no new notification is read (in this case it returns false). + by doing so all the notifications in the fd can be retrieved through a single call to get_change_event. 
+ """ + found = 0 + + try: + read, _, _ = select.select([self.rx_fd_p.fd], [], [], timeout) + except select.error as err: + rc, msg = err + if rc == errno.EAGAIN or rc == errno.EINTR: + return False + else: + raise + + for fd in read: + if fd == self.rx_fd_p.fd: + success, port_list, module_state = self.on_pmpe(self.rx_fd_p) + if not success: + logger.log_error("failed to read from {}".format(fd)) + break + + sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN) + if sfp_state == STATUS_UNKNOWN: + # in the following sequence, STATUS_UNKNOWN can be returned. + # so we shouldn't raise exception here. + # 1. some sfp module is inserted + # 2. sfp_event gets stuck and fails to fetch the change event instantaneously + # 3. and then the sfp module is removed + # 4. sfp_event starts to try fetching the change event + # in this case found is increased so that True will be returned + logger.log_info("unknown module state {}, maybe the port suffers two adjacent insertion/removal".format(module_state)) + found += 1 + continue + + for port in port_list: + logger.log_info("SFP on port {} state {}".format(port, sfp_state)) + port_change[port] = sfp_state + found += 1 + + if found == 0: + return False + else: + return True + + def on_pmpe(self, fd_p): + ''' on port module plug event handler ''' + + # recv parameters + pkt_size = PMPE_PACKET_SIZE + pkt_size_p = new_uint32_t_p() + uint32_t_p_assign(pkt_size_p, pkt_size) + pkt = new_uint8_t_arr(pkt_size) + recv_info_p = new_sx_receive_info_t_p() + pmpe_t = sx_event_pmpe_t() + port_attributes_list = new_sx_port_attributes_t_arr(64) + port_cnt_p = new_uint32_t_p() + uint32_t_p_assign(port_cnt_p,64) + label_port_list = [] + module_state = 0 + + rc = sx_lib_host_ifc_recv(fd_p, pkt, pkt_size_p, recv_info_p) + if rc != 0: + logger.log_error("sx_lib_host_ifc_recv exited with error, rc %d" % rc) + status = False + else: + status = True + pmpe_t = recv_info_p.event_info.pmpe + port_list_size = pmpe_t.list_size + 
logical_port_list = pmpe_t.log_port_list + module_state = pmpe_t.module_state + + for i in xrange(port_list_size): + logical_port = sx_port_log_id_t_arr_getitem(logical_port_list, i) + rc = sx_api_port_device_get(self.handle, 1 , 0, port_attributes_list, port_cnt_p) + port_cnt = uint32_t_p_value(port_cnt_p) + + for i in xrange(port_cnt): + port_attributes = sx_port_attributes_t_arr_getitem(port_attributes_list,i) + if port_attributes.log_port == logical_port: + lable_port = port_attributes.port_mapping.module_port + break + label_port_list.append(lable_port) + + delete_uint32_t_p(pkt_size_p) + delete_uint8_t_arr(pkt) + delete_sx_receive_info_t_p(recv_info_p) + delete_sx_port_attributes_t_arr(port_attributes_list) + delete_uint32_t_p(port_cnt_p) + + return status, label_port_list, module_state, From 0b95e158ee3c7700a48e257d0df51c1f54f661f8 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Wed, 17 Jul 2019 09:16:32 +0300 Subject: [PATCH 2/3] rephase some comment --- .../mlnx-platform-api/sonic_platform/chassis.py | 12 +++++++----- .../mlnx-platform-api/sonic_platform/sfp_event.py | 5 ++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index cc92f2a27937..78eb48403018 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -385,12 +385,14 @@ def get_change_event(self, timeout=0): else: status = self.sfp_event.check_sfp_status(port_dict, timeout) - # if no event polled, just return empty set if status: - # workaround. - # check_sfp_status cannot return all the notifications in fd via a single call - # due to sdk reason (see comment in check_sfp_status for detail). - # we have to iterate in a loop to get all the notifications in the fd. + # get_change_event has the meaning of retrieving all the notifications through a single call. 
+ # Typically this is implemented via a select framework, which requires the underlying file-reading + # interface to be able to retrieve all notifications without blocking once the fd has been selected. + # However, the sdk doesn't provide any interface that satisfies this requirement. As a result, + # check_sfp_status returns only one notification per call, and there may be more notifications in its queue. + # For this reason, we have to iterate in a loop to get all the notifications in case + # the first call returns at least one. i = 0 while i < self.MAX_SELECT_EVENT_RETURNED: status = self.sfp_event.check_sfp_status(port_dict, 0) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py index d4aa3f93d3e4..d452a22db27f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py @@ -38,9 +38,8 @@ def __init__(self): self.handle = None def initialize(self): - # open SDK API handle - # retry at most SX_OPEN_RETRIES times to wait - # until SDK is started during system startup + # open SDK API handle. 
+ # retry at most SX_OPEN_RETRIES times to wait until SDK is started during system startup retry = 1 while True: rc, self.handle = sx_api_open(None) From 8ad1826d20d477117cefdbf77e9c7a5fd8bfdc74 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Wed, 17 Jul 2019 12:07:38 +0300 Subject: [PATCH 3/3] fix typo in sfp_event.sfp_event.initialize --- platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py index d452a22db27f..1e57603d38ad 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py @@ -71,7 +71,7 @@ def initialize(self): SX_TRAP_ID_PMPE, self.user_channel_p) if rc != SX_STATUS_SUCCESS: - raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(c)) + raise RuntimeError("sx_api_host_ifc_trap_id_register_set exited with error, rc {}".format(rc)) def deinitialize(self): if self.handle is None: