diff --git a/orchagent/pfc_detect_barefoot.lua b/orchagent/pfc_detect_barefoot.lua index b270549a29bd..c413c5999cdc 100644 --- a/orchagent/pfc_detect_barefoot.lua +++ b/orchagent/pfc_detect_barefoot.lua @@ -36,63 +36,68 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) - -- Check actual condition of queue being in PFC storm - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - if time_left <= poll_time then - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + if time_left <= poll_time then + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - if is_deadlock == false then - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + if is_deadlock == false then + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end diff --git a/orchagent/pfc_detect_broadcom.lua b/orchagent/pfc_detect_broadcom.lua index 4f82b933176f..29ed2d163393 100644 --- a/orchagent/pfc_detect_broadcom.lua +++ b/orchagent/pfc_detect_broadcom.lua @@ -35,61 +35,66 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_on2off_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_ON2OFF_RX_PKTS' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_on2off_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_ON2OFF_RX_PKTS' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_on2off = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key) - local queue_pause_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS') + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_on2off = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key) + local queue_pause_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS') - if occupancy_bytes and packets and pfc_rx_packets and pfc_on2off and queue_pause_status then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_on2off = tonumber(pfc_on2off) + if occupancy_bytes and packets and pfc_rx_packets and pfc_on2off and queue_pause_status then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_on2off = tonumber(pfc_on2off) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_on2off_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last') - local queue_pause_status_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last') + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_on2off_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last') + local queue_pause_status_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_on2off_last and queue_pause_status_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_on2off_last = tonumber(pfc_on2off_last) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_on2off_last and queue_pause_status_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_on2off_last = tonumber(pfc_on2off_last) - -- Check actual condition of queue being in PFC storm - if (pfc_rx_packets - pfc_rx_packets_last > 0 and pfc_on2off - pfc_on2off_last == 0 and queue_pause_status_last == 'true' and queue_pause_status == 'true') or - (debug_storm == "enabled") then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (pfc_rx_packets - pfc_rx_packets_last > 0 and pfc_on2off - pfc_on2off_last == 0 and queue_pause_status_last == 'true' and queue_pause_status == 'true') or + (debug_storm == "enabled") then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last', queue_pause_status) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last', pfc_on2off) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last', queue_pause_status) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last', pfc_on2off) + end end end end diff --git a/orchagent/pfc_detect_innovium.lua b/orchagent/pfc_detect_innovium.lua index cedd51baa327..8deedeaa4f4f 100644 --- a/orchagent/pfc_detect_innovium.lua +++ b/orchagent/pfc_detect_innovium.lua @@ -36,72 +36,77 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' - - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) - - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) - - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. - - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) - - -- Check actual condition of queue being in PFC storm - -- if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) then - -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_1', 'YES') - - -- if (debug_storm == "enabled") then - -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_2', 'YES') - - -- if (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_3', 'YES') - - - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and pfc_rx_packets - pfc_rx_packets_last > 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) + + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. + + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) + + -- Check actual condition of queue being in PFC storm + -- if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) then + -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_1', 'YES') + + -- if (debug_storm == "enabled") then + -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_2', 'YES') + + -- if (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + -- redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'K7_debug_3', 'YES') + + + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and pfc_rx_packets - pfc_rx_packets_last > 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - if is_deadlock == false then - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + if is_deadlock == false then + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end diff --git a/orchagent/pfc_detect_mellanox.lua b/orchagent/pfc_detect_mellanox.lua index 6df16241e91e..e805ad9cff1e 100644 --- a/orchagent/pfc_detect_mellanox.lua +++ b/orchagent/pfc_detect_mellanox.lua @@ -36,64 +36,69 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) - local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) + local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8) - -- Check actual condition of queue being in PFC storm - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and packets - packets_last == 0 and storm_condition) then - if time_left <= poll_time then - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and packets - packets_last == 0 and storm_condition) then + if time_left <= poll_time then + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - if is_deadlock == false then - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + if is_deadlock == false then + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end diff --git a/orchagent/pfc_detect_nephos.lua b/orchagent/pfc_detect_nephos.lua index d152fc5f8c76..648904e17a55 100644 --- a/orchagent/pfc_detect_nephos.lua +++ b/orchagent/pfc_detect_nephos.lua @@ -35,65 +35,70 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION' - -- Get all counters - local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') - local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') - local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) - local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) + -- Get all counters + local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES') + local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS') + local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key) + local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key) - if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then - occupancy_bytes = tonumber(occupancy_bytes) - packets = tonumber(packets) - pfc_rx_packets = tonumber(pfc_rx_packets) - pfc_duration = tonumber(pfc_duration) + if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then + occupancy_bytes = tonumber(occupancy_bytes) + packets = tonumber(packets) + pfc_rx_packets = tonumber(pfc_rx_packets) + pfc_duration = tonumber(pfc_duration) - local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. + local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. - -- If this is not a first run, then we have last values available - if packets_last and pfc_rx_packets_last and pfc_duration_last then - packets_last = tonumber(packets_last) - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - pfc_duration_last = tonumber(pfc_duration_last) + -- If this is not a first run, then we have last values available + if packets_last and pfc_rx_packets_last and pfc_duration_last then + packets_last = tonumber(packets_last) + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + pfc_duration_last = tonumber(pfc_duration_last) - -- Check actual condition of queue being in PFC storm - if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or - -- DEBUG CODE START. Uncomment to enable - (debug_storm == "enabled") or - -- DEBUG CODE END. - (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') - is_deadlock = true - time_left = detection_time + -- Check actual condition of queue being in PFC storm + if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or + -- DEBUG CODE START. Uncomment to enable + (debug_storm == "enabled") or + -- DEBUG CODE END. + (occupancy_bytes == 0 and packets - packets_last == 0 and (pfc_duration - pfc_duration_last) > poll_time * 0.8) then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm"]') + is_deadlock = true + time_left = detection_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time - end - else - if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + end + time_left = detection_time end - time_left = detection_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) - redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last') + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration) + end end end end end return rets - + diff --git a/orchagent/pfc_restore.lua b/orchagent/pfc_restore.lua index 7b137a40d348..4c278526876e 100644 --- a/orchagent/pfc_restore.lua +++ b/orchagent/pfc_restore.lua @@ -32,36 +32,41 @@ for i = n, 1, -1 do local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) - local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' + -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then + -- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding + -- maps haven't been updated yet. + if queue_index and port_id then + local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' - local pfc_rx_packets = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)) - local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') - -- DEBUG CODE START. Uncomment to enable - local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') - -- DEBUG CODE END. - if pfc_rx_packets_last then - pfc_rx_packets_last = tonumber(pfc_rx_packets_last) + local pfc_rx_packets = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)) + local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') + -- DEBUG CODE START. Uncomment to enable + local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') + -- DEBUG CODE END. + if pfc_rx_packets_last then + pfc_rx_packets_last = tonumber(pfc_rx_packets_last) - -- Check actual condition of queue being restored from PFC storm - if (pfc_rx_packets - pfc_rx_packets_last == 0) - -- DEBUG CODE START. Uncomment to enable - and (debug_storm ~= "enabled") - -- DEBUG CODE END. - then - if time_left <= poll_time then - redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') - time_left = restoration_time + -- Check actual condition of queue being restored from PFC storm + if (pfc_rx_packets - pfc_rx_packets_last == 0) + -- DEBUG CODE START. Uncomment to enable + and (debug_storm ~= "enabled") + -- DEBUG CODE END. + then + if time_left <= poll_time then + redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]') + time_left = restoration_time + else + time_left = time_left - poll_time + end else - time_left = time_left - poll_time + time_left = restoration_time end - else - time_left = restoration_time end - end - -- Save values for next run - redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_RESTORATION_TIME_LEFT', time_left) - redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + -- Save values for next run + redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_RESTORATION_TIME_LEFT', time_left) + redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) + end end end