Skip to content

Commit

Permalink
[202106] Prevent other notification event storms to keep enqueue unch…
Browse files Browse the repository at this point in the history
…ecked and drained all memory that leads to crashing the switch router (#977)
  • Loading branch information
gechiang authored Dec 8, 2021
1 parent de02b6d commit cd83e16
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 6 deletions.
50 changes: 45 additions & 5 deletions syncd/NotificationQueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@ using namespace syncd;
#define MUTEX std::lock_guard<std::mutex> _lock(m_mutex);

NotificationQueue::NotificationQueue(
_In_ size_t queueLimit):
_In_ size_t queueLimit,
_In_ size_t consecutiveThresholdLimit):
m_queueSizeLimit(queueLimit),
m_dropCount(0)
m_thresholdLimit(consecutiveThresholdLimit),
m_dropCount(0),
m_lastEventCount(0),
m_lastEvent(SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT)
{
SWSS_LOG_ENTER();

Expand All @@ -30,16 +34,52 @@ bool NotificationQueue::enqueue(
MUTEX;

SWSS_LOG_ENTER();
bool candidateToDrop = false;
std::string currentEvent;

/*
* If the queue exceeds the limit, then drop all further FDB events This is
* a temporary solution to handle high memory usage by syncd and the
* notification queue keeps growing. The permanent solution would be to
* make this stateful so that only the *latest* event is published.
*
* We have also seen other notification storms that can also cause this queue issue
* So the new scheme is to keep the last notification event and its consecutive count
* If threshold limit reached and the consecutive count also reached then this notification
* will also be dropped regardless of its event type to protect the device from crashing due to
* running out of memory
*/
auto queueSize = m_queue.size();
currentEvent = kfvKey(item);
if (currentEvent == m_lastEvent)
{
m_lastEventCount++;
}
else
{
m_lastEventCount = 1;
m_lastEvent = currentEvent;
}
if (queueSize >= m_queueSizeLimit)
{
/* Too many queued up already check if notification fits condition to be dropped
* 1. All FDB events should be dropped at this point.
* 2. All other notification events will start to drop if it reached the consecutive threshold limit
*/
if (currentEvent == SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT)
{
candidateToDrop = true;
}
else
{
if (m_lastEventCount >= m_thresholdLimit)
{
candidateToDrop = true;
}
}
}

if (queueSize < m_queueSizeLimit || kfvOp(item) != SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT) // TODO use enum instead of strings
if (!candidateToDrop)
{
m_queue.push(item);

Expand All @@ -51,9 +91,9 @@ bool NotificationQueue::enqueue(
if (!(m_dropCount % NOTIFICATION_QUEUE_DROP_COUNT_INDICATOR))
{
SWSS_LOG_NOTICE(
"Too many messages in queue (%zu), dropped %zu FDB events!",
"Too many messages in queue (%zu), dropped (%zu), lastEventCount (%zu) Dropping %s !",
queueSize,
m_dropCount);
m_dropCount, m_lastEventCount, m_lastEvent.c_str());
}

return false;
Expand Down
18 changes: 17 additions & 1 deletion syncd/NotificationQueue.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,18 @@ extern "C" {
* Value based on typical L2 deployment with 256k MAC entries and
* some extra buffer for other events like port-state, q-deadlock etc
*
* Note: We recently found a case where SAI continuously sending switch notification events
* that also caused the queue to keep growing and eventually exhaust all system memory and crashed.
* So a new detection/limit scheme is being implemented by keeping track of the last Event
* and if the current Event matches the last Event, then the last Event Count will get
* incremented and this count will also be used as part of the equation to ensure this
* notification should also be dropped if the queue size limit has already reached and not
* just dropping FDB events only.
*
* TODO: move to config, also this limit only applies to fdb notifications
*/
#define DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT (300000)
#define DEFAULT_NOTIFICATION_CONSECUTIVE_THRESHOLD (1000)

namespace syncd
{
Expand All @@ -26,7 +35,8 @@ namespace syncd
public:

NotificationQueue(
_In_ size_t limit = DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT);
_In_ size_t limit = DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT,
_In_ size_t consecutiveThresholdLimit = DEFAULT_NOTIFICATION_CONSECUTIVE_THRESHOLD);

virtual ~NotificationQueue();

Expand All @@ -48,6 +58,12 @@ namespace syncd

size_t m_queueSizeLimit;

size_t m_thresholdLimit;

size_t m_dropCount;

size_t m_lastEventCount;

std::string m_lastEvent;
};
}

0 comments on commit cd83e16

Please sign in to comment.