Skip to content

Commit

Permalink
Add TimerWatchdog for monitoring long execution apis (sonic-net#469)
Browse files Browse the repository at this point in the history
* Add TimerWatchdog for monitoring long execution apis

* Add callback

* Fix spelling

* Change timer watchdog resolution to microseconds

* Add callback entry log
  • Loading branch information
kcudnik authored Jul 1, 2019
1 parent 4b78fbe commit bea4fe8
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 3 deletions.
1 change: 1 addition & 0 deletions meta/saiserialize.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "sai_serialize.h"
#include "meta/sai_meta.h"
#include "swss/tokenize.h"

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#include "swss/json.hpp"
Expand Down
6 changes: 4 additions & 2 deletions syncd/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ syncd_SOURCES = \
syncd_hard_reinit.cpp \
syncd_notifications.cpp \
syncd_applyview.cpp \
syncd_flex_counter.cpp
syncd_flex_counter.cpp \
TimerWatchdog.cpp

syncd_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON) $(SAIFLAGS)
syncd_LDADD = -lhiredis -lswsscommon $(SAILIB) -lpthread -L$(top_srcdir)/meta/.libs -lsaimetadata -lsaimeta -ldl
Expand All @@ -51,7 +52,8 @@ tests_SOURCES = \
syncd_hard_reinit.cpp \
syncd_notifications.cpp \
syncd_applyview.cpp \
syncd_flex_counter.cpp
syncd_flex_counter.cpp \
TimerWatchdog.cpp

tests_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON)
tests_LDADD = -lhiredis -lswsscommon -lpthread -L$(top_srcdir)/lib/src/.libs -lsairedis -L$(top_srcdir)/meta/.libs -lsaimetadata -lsaimeta
Expand Down
133 changes: 133 additions & 0 deletions syncd/TimerWatchdog.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#include "TimerWatchdog.h"

#include "swss/logger.h"

#include <chrono>

TimerWatchdog::TimerWatchdog(
_In_ int64_t warnTimespan):
m_run(true),
m_warnTimespan(warnTimespan),
m_callback(0)
{
SWSS_LOG_ENTER();

int64_t start = getTimeSinceEpoch();
m_startTimestamp = start;
m_endTimestamp = start;
m_lastCheckTimestamp = start;

m_thread = std::make_shared<std::thread>(&TimerWatchdog::threadFunction, this);

// m_thread->detach()
}

TimerWatchdog::~TimerWatchdog()
{
SWSS_LOG_ENTER();

m_run = false;

m_thread->join();
}

void TimerWatchdog::setStartTime()
{
SWSS_LOG_ENTER();

do
{
m_startTimestamp = getTimeSinceEpoch();
}
while (m_startTimestamp <= m_endTimestamp); // make sure new start time is always past last end time

}

void TimerWatchdog::setEndTime()
{
SWSS_LOG_ENTER();

do
{
m_endTimestamp = getTimeSinceEpoch();
}
while (m_endTimestamp <= m_startTimestamp); // make sure new end time is always past last start time
}

void TimerWatchdog::setCallback(
_In_ Callback callback)
{
SWSS_LOG_ENTER();

m_callback = callback;
}

void TimerWatchdog::threadFunction()
{
SWSS_LOG_ENTER();

SWSS_LOG_NOTICE("starting timer watchdog thread");

int id = 0;

while (m_run)
{
id++;
std::this_thread::sleep_for(std::chrono::seconds(1));

// we make local copies, since executing functions can be so fast that
// when we will read second time start timestamp it can be different
// than previous one

int64_t start = m_startTimestamp;
int64_t end = m_endTimestamp;
int64_t now = getTimeSinceEpoch(); // now needs to be obtained after obtaining start

int64_t span = end - start;

if (span < 0 && start > m_lastCheckTimestamp)
{
SWSS_LOG_NOTICE(" span < 0 = %ld at %ld", span, now);

// this means start > end, so new function is currently executing,
// or that function hanged, so see how long that function is
// executing, this negative span can be arbitrary long even hours,
// and that is fine, since we don't know when OA makes next
// function call

span = now - start; // this must be always non negative

SWSS_LOG_NOTICE(" new span = %ld", span);

if (span < 0)
SWSS_LOG_THROW("negative span 'now - start': %ld - %ld", now, start);

if (span > m_warnTimespan)
{
m_lastCheckTimestamp = start;

// function probably hanged

SWSS_LOG_WARN("time (span < 0) watchdog exceeded %ld microseconds", span);

auto callback = m_callback;

if (callback)
callback(span);
}

continue;
}

m_lastCheckTimestamp = start;
}

SWSS_LOG_NOTICE("ending timer watchdog thread");
}

int64_t TimerWatchdog::getTimeSinceEpoch()
{
SWSS_LOG_ENTER();

return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
60 changes: 60 additions & 0 deletions syncd/TimerWatchdog.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#ifndef __TIMER_WATCHDOG_H__
#define __TIMER_WATCHDOG_H__

#include <thread>
#include <atomic>

#ifndef _In_
#define _In_
#endif

class TimerWatchdog
{
typedef void (*Callback)(
_In_ int64_t span);

public:

TimerWatchdog(
_In_ int64_t warnTimespan);

virtual ~TimerWatchdog();

public:

void setStartTime();

void setEndTime();

void setCallback(
_In_ Callback callback);

/**
* @brief Gets timestamp since epoch.
*
* @return Time since epoch in microseconds.
*/
static int64_t getTimeSinceEpoch();

private:

void threadFunction();

private:

volatile bool m_run;

// all values are in microseconds

std::atomic_int_fast64_t m_warnTimespan;
std::atomic_int_fast64_t m_startTimestamp;
std::atomic_int_fast64_t m_endTimestamp;
std::atomic_int_fast64_t m_lastCheckTimestamp;

std::shared_ptr<std::thread> m_thread;

Callback m_callback;

};

#endif // __TIMER_WATCHDOG_H__
20 changes: 20 additions & 0 deletions syncd/syncd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "swss/warm_restart.h"
#include "swss/table.h"

#include "TimerWatchdog.h"

extern "C" {
#include <sai.h>
}
Expand Down Expand Up @@ -3642,6 +3644,14 @@ void sai_meta_log_syncd(
swss::Logger::getInstance().write(p, ":- %s: %s", func, buffer);
}

void timerWatchdogCallback(
_In_ int64_t span)
{
SWSS_LOG_ENTER();

SWSS_LOG_ERROR("main loop execution exceeded %ld ms", span);
}

int syncd_main(int argc, char **argv)
{
swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_DEBUG);
Expand Down Expand Up @@ -3821,6 +3831,10 @@ int syncd_main(int argc, char **argv)
runMainLoop = false;
}

TimerWatchdog twd(30 * 1000000); // watch for executions over 30 seconds

twd.setCallback(timerWatchdogCallback);

while(runMainLoop)
{
try
Expand All @@ -3829,6 +3843,8 @@ int syncd_main(int argc, char **argv)

int result = s->select(&sel);

twd.setStartTime();

if (sel == restartQuery.get())
{
/*
Expand Down Expand Up @@ -3921,6 +3937,8 @@ int syncd_main(int argc, char **argv)
{
processEvent(*(swss::ConsumerTable*)sel);
}

twd.setEndTime();
}
catch(const std::exception &e)
{
Expand All @@ -3937,6 +3955,8 @@ int syncd_main(int argc, char **argv)

// make sure that if second exception will arise, then we break the loop
options.disableExitSleep = true;

twd.setEndTime();
}
}

Expand Down
3 changes: 2 additions & 1 deletion tests/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ vssyncd_SOURCES = \
../syncd/syncd_hard_reinit.cpp \
../syncd/syncd_notifications.cpp \
../syncd/syncd_applyview.cpp \
../syncd/syncd_flex_counter.cpp
../syncd/syncd_flex_counter.cpp \
../syncd/TimerWatchdog.cpp

vssyncd_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON) $(SAIFLAGS)
vssyncd_LDADD = -lhiredis -lswsscommon $(SAILIB) -lpthread -L$(top_srcdir)/meta/.libs -lsaimetadata -lsaimeta -ldl
Expand Down

0 comments on commit bea4fe8

Please sign in to comment.