Skip to content

Commit

Permalink
[synchronous mode] Add failure notification for SAI failures in synch…
Browse files Browse the repository at this point in the history
…ronous mode (sonic-net#1596)

Add function to notify users in the presence of SAI failures by throwing exceptions and trigger swss restart in synchronous mode. And include this function in routeorch and neighorch.

This is a part of the first step in the SAI failure handling in orchagent with synchronous mode:
    1. Failure notification mechanism to ensure enough notifications in the presence of SAI failures and avoid running switches with unhandled failures.
    2. Add general failure handling logic by status.
    3. Develop fine-grain failure handling mechanism for each orch to properly handle different SAI failures.

This function aims to ensure enough notifications in the presence of SAI failures and avoid running switches with unhandled failures (on-par with asynchronous mode).
  • Loading branch information
shi-su authored and Shi Su committed Aug 17, 2021
1 parent 3f35776 commit fc8e43f
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 32 deletions.
25 changes: 13 additions & 12 deletions cfgmgr/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ INCLUDES = -I$(top_srcdir)/lib -I $(top_srcdir) -I $(top_srcdir)/orchagent -I $(
CFLAGS_SAI = -I /usr/include/sai
LIBNL_CFLAGS = -I/usr/include/libnl3
LIBNL_LIBS = -lnl-genl-3 -lnl-route-3 -lnl-3
SAIMETA_LIBS = -lsaimeta -lsaimetadata

bin_PROGRAMS = vlanmgrd teammgrd portmgrd intfmgrd buffermgrd vrfmgrd nbrmgrd vxlanmgrd sflowmgrd natmgrd coppmgrd tunnelmgrd

Expand All @@ -24,60 +25,60 @@ endif
vlanmgrd_SOURCES = vlanmgrd.cpp vlanmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
vlanmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
vlanmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
vlanmgrd_LDADD = -lswsscommon
vlanmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

teammgrd_SOURCES = teammgrd.cpp teammgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
teammgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
teammgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
teammgrd_LDADD = -lswsscommon
teammgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

portmgrd_SOURCES = portmgrd.cpp portmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
portmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
portmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
portmgrd_LDADD = -lswsscommon
portmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

intfmgrd_SOURCES = intfmgrd.cpp intfmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
intfmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
intfmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
intfmgrd_LDADD = -lswsscommon
intfmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

buffermgrd_SOURCES = buffermgrd.cpp buffermgr.cpp buffermgrdyn.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
buffermgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
buffermgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
buffermgrd_LDADD = -lswsscommon
buffermgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

vrfmgrd_SOURCES = vrfmgrd.cpp vrfmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
vrfmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
vrfmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
vrfmgrd_LDADD = -lswsscommon
vrfmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

nbrmgrd_SOURCES = nbrmgrd.cpp nbrmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
nbrmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) $(LIBNL_CFLAGS)
nbrmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI) $(LIBNL_CPPFLAGS)
nbrmgrd_LDADD = -lswsscommon $(LIBNL_LIBS)
nbrmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS) $(LIBNL_LIBS)

vxlanmgrd_SOURCES = vxlanmgrd.cpp vxlanmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
vxlanmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
vxlanmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
vxlanmgrd_LDADD = -lswsscommon
vxlanmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

sflowmgrd_SOURCES = sflowmgrd.cpp sflowmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
sflowmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
sflowmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
sflowmgrd_LDADD = -lswsscommon
sflowmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

natmgrd_SOURCES = natmgrd.cpp natmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
natmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
natmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
natmgrd_LDADD = -lswsscommon
natmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

coppmgrd_SOURCES = coppmgrd.cpp coppmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
coppmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
coppmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
coppmgrd_LDADD = -lswsscommon
coppmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

tunnelmgrd_SOURCES = tunnelmgrd.cpp tunnelmgr.cpp $(top_srcdir)/orchagent/orch.cpp $(top_srcdir)/orchagent/request_parser.cpp shellcmd.h
tunnelmgrd_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
tunnelmgrd_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON) $(CFLAGS_SAI)
tunnelmgrd_LDADD = -lswsscommon
tunnelmgrd_LDADD = -lswsscommon $(SAIMETA_LIBS)

12 changes: 6 additions & 6 deletions orchagent/neighorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ bool NeighOrch::addNextHop(const IpAddress &ipAddress, const string &alias)
{
SWSS_LOG_ERROR("Failed to create next hop %s on %s, rv:%d",
ipAddress.to_string().c_str(), alias.c_str(), status);
return false;
return handleSaiCreateStatus(SAI_API_NEXT_HOP, status);
}

SWSS_LOG_NOTICE("Created next hop %s on %s",
Expand Down Expand Up @@ -642,7 +642,7 @@ bool NeighOrch::addNeighbor(const NeighborEntry &neighborEntry, const MacAddress
{
SWSS_LOG_ERROR("Failed to create neighbor %s on %s, rv:%d",
macAddress.to_string().c_str(), alias.c_str(), status);
return false;
return handleSaiCreateStatus(SAI_API_NEIGHBOR, status);
}
}
SWSS_LOG_NOTICE("Created neighbor ip %s, %s on %s", ip_address.to_string().c_str(),
Expand All @@ -665,7 +665,7 @@ bool NeighOrch::addNeighbor(const NeighborEntry &neighborEntry, const MacAddress
{
SWSS_LOG_ERROR("Failed to remove neighbor %s on %s, rv:%d",
macAddress.to_string().c_str(), alias.c_str(), status);
return false;
return handleSaiRemoveStatus(SAI_API_NEIGHBOR, status);
}
m_intfsOrch->decreaseRouterIntfsRefCount(alias);

Expand All @@ -689,7 +689,7 @@ bool NeighOrch::addNeighbor(const NeighborEntry &neighborEntry, const MacAddress
{
SWSS_LOG_ERROR("Failed to update neighbor %s on %s, rv:%d",
macAddress.to_string().c_str(), alias.c_str(), status);
return false;
return handleSaiSetStatus(SAI_API_NEIGHBOR, status);
}
SWSS_LOG_NOTICE("Updated neighbor %s on %s", macAddress.to_string().c_str(), alias.c_str());
}
Expand Down Expand Up @@ -746,7 +746,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
{
SWSS_LOG_ERROR("Failed to remove next hop %s on %s, rv:%d",
ip_address.to_string().c_str(), alias.c_str(), status);
return false;
return handleSaiRemoveStatus(SAI_API_NEXT_HOP, status);
}
}

Expand Down Expand Up @@ -778,7 +778,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
{
SWSS_LOG_ERROR("Failed to remove neighbor %s on %s, rv:%d",
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str(), status);
return false;
return handleSaiRemoveStatus(SAI_API_NEIGHBOR, status);
}
}

Expand Down
77 changes: 77 additions & 0 deletions orchagent/orch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "tokenize.h"
#include "logger.h"
#include "consumerstatetable.h"
#include "sai_serialize.h"

using namespace swss;

Expand Down Expand Up @@ -685,6 +686,82 @@ Executor *Orch::getExecutor(string executorName)
return NULL;
}

bool Orch::handleSaiCreateStatus(sai_api_t api, sai_status_t status, void *context)
{
/*
* This function aims to provide coarse handling of failures in sairedis create
* operation (i.e., notify users by throwing excepions when failures happen).
* Return value: true - Handled the status successfully. No need to retry this SAI operation.
* false - Cannot handle the status. Need to retry the SAI operation.
* TODO: 1. Add general handling logic for specific statuses (e.g., SAI_STATUS_ITEM_ALREADY_EXISTS)
* 2. Develop fine-grain failure handling mechanisms and replace this coarse handling
* in each orch.
* 3. Take the type of sai api into consideration.
*/
switch (status)
{
case SAI_STATUS_SUCCESS:
SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiCreateStatus");
return true;
default:
SWSS_LOG_ERROR("Encountered failure in create operation, exiting orchagent, SAI API: %s, status: %s",
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
exit(EXIT_FAILURE);
}
return false;
}

bool Orch::handleSaiSetStatus(sai_api_t api, sai_status_t status, void *context)
{
/*
* This function aims to provide coarse handling of failures in sairedis set
* operation (i.e., notify users by throwing excepions when failures happen).
* Return value: true - Handled the status successfully. No need to retry this SAI operation.
* false - Cannot handle the status. Need to retry the SAI operation.
* TODO: 1. Add general handling logic for specific statuses
* 2. Develop fine-grain failure handling mechanisms and replace this coarse handling
* in each orch.
* 3. Take the type of sai api into consideration.
*/
switch (status)
{
case SAI_STATUS_SUCCESS:
SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiSetStatus");
return true;
default:
SWSS_LOG_ERROR("Encountered failure in set operation, exiting orchagent, SAI API: %s, status: %s",
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
exit(EXIT_FAILURE);
}
return false;
}

bool Orch::handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context)
{
/*
* This function aims to provide coarse handling of failures in sairedis remove
* operation (i.e., notify users by throwing excepions when failures happen).
* Return value: true - Handled the status successfully. No need to retry this SAI operation.
* false - Cannot handle the status. Need to retry the SAI operation.
* TODO: 1. Add general handling logic for specific statuses (e.g., SAI_STATUS_OBJECT_IN_USE,
* SAI_STATUS_ITEM_NOT_FOUND)
* 2. Develop fine-grain failure handling mechanisms and replace this coarse handling
* in each orch.
* 3. Take the type of sai api into consideration.
*/
switch (status)
{
case SAI_STATUS_SUCCESS:
SWSS_LOG_WARN("SAI_STATUS_SUCCESS is not expected in handleSaiRemoveStatus");
return true;
default:
SWSS_LOG_ERROR("Encountered failure in remove operation, exiting orchagent, SAI API: %s, status: %s",
sai_serialize_api(api).c_str(), sai_serialize_status(status).c_str());
exit(EXIT_FAILURE);
}
return false;
}

void Orch2::doTask(Consumer &consumer)
{
SWSS_LOG_ENTER();
Expand Down
5 changes: 5 additions & 0 deletions orchagent/orch.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,11 @@ class Orch
/* Note: consumer will be owned by this class */
void addExecutor(Executor* executor);
Executor *getExecutor(std::string executorName);

/* Handling SAI status*/
virtual bool handleSaiCreateStatus(sai_api_t api, sai_status_t status, void *context = nullptr);
virtual bool handleSaiSetStatus(sai_api_t api, sai_status_t status, void *context = nullptr);
virtual bool handleSaiRemoveStatus(sai_api_t api, sai_status_t status, void *context = nullptr);
private:
void removeMeFromObjsReferencedByMe(type_map &type_maps, const std::string &table, const std::string &obj_name, const std::string &field, const std::string &old_referenced_obj_name);
void addConsumer(swss::DBConnector *db, std::string tableName, int pri = default_orch_pri);
Expand Down
29 changes: 15 additions & 14 deletions orchagent/routeorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ bool RouteOrch::validnexthopinNextHopGroup(const NextHopKey &nexthop, uint32_t&
{
SWSS_LOG_ERROR("Failed to add next hop member to group %" PRIx64 ": %d\n",
nhopgroup->second.next_hop_group_id, status);
return false;
return handleSaiCreateStatus(SAI_API_NEXT_HOP_GROUP, status);
}

++count;
Expand Down Expand Up @@ -373,7 +373,7 @@ bool RouteOrch::invalidnexthopinNextHopGroup(const NextHopKey &nexthop, uint32_t
{
SWSS_LOG_ERROR("Failed to remove next hop member %" PRIx64 " from group %" PRIx64 ": %d\n",
nexthop_id, nhopgroup->second.next_hop_group_id, status);
return false;
return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, status);
}

++count;
Expand Down Expand Up @@ -971,7 +971,7 @@ bool RouteOrch::createFineGrainedNextHopGroup(sai_object_id_t &next_hop_group_id
if (status != SAI_STATUS_SUCCESS)
{
SWSS_LOG_ERROR("Failed to create next hop group rv:%d", status);
return false;
return handleSaiCreateStatus(SAI_API_NEXT_HOP_GROUP, status);
}

gCrmOrch->incCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP);
Expand All @@ -989,7 +989,7 @@ bool RouteOrch::removeFineGrainedNextHopGroup(sai_object_id_t &next_hop_group_id
{
SWSS_LOG_ERROR("Failed to remove next hop group %" PRIx64 ", rv:%d",
next_hop_group_id, status);
return false;
return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, status);
}

gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP);
Expand Down Expand Up @@ -1062,7 +1062,7 @@ bool RouteOrch::addNextHopGroup(const NextHopGroupKey &nexthops)
{
SWSS_LOG_ERROR("Failed to create next hop group %s, rv:%d",
nexthops.to_string().c_str(), status);
return false;
return handleSaiCreateStatus(SAI_API_NEXT_HOP_GROUP, status);
}

m_nextHopGroupCount ++;
Expand Down Expand Up @@ -1190,7 +1190,7 @@ bool RouteOrch::removeNextHopGroup(const NextHopGroupKey &nexthops)
{
SWSS_LOG_ERROR("Failed to remove next hop group member[%zu] %" PRIx64 ", rv:%d",
i, next_hop_ids[i], statuses[i]);
return false;
return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, statuses[i]);
}

gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_NEXTHOP_GROUP_MEMBER);
Expand All @@ -1200,7 +1200,7 @@ bool RouteOrch::removeNextHopGroup(const NextHopGroupKey &nexthops)
if (status != SAI_STATUS_SUCCESS)
{
SWSS_LOG_ERROR("Failed to remove next hop group %" PRIx64 ", rv:%d", next_hop_group_id, status);
return false;
return handleSaiRemoveStatus(SAI_API_NEXT_HOP_GROUP, status);
}

m_nextHopGroupCount --;
Expand Down Expand Up @@ -1703,7 +1703,8 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey
}
else if (it_route == m_syncdRoutes.at(vrf_id).end())
{
if (*it_status++ != SAI_STATUS_SUCCESS)
sai_status_t status = *it_status++;
if (status != SAI_STATUS_SUCCESS)
{
SWSS_LOG_ERROR("Failed to create route %s with next hop(s) %s",
ipPrefix.to_string().c_str(), nextHops.to_string().c_str());
Expand All @@ -1712,7 +1713,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey
{
removeNextHopGroup(nextHops);
}
return false;
return handleSaiCreateStatus(SAI_API_ROUTE, status);
}

if (ipPrefix.isV4())
Expand Down Expand Up @@ -1742,7 +1743,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey
{
SWSS_LOG_ERROR("Failed to set route %s with packet action forward, %d",
ipPrefix.to_string().c_str(), status);
return false;
return handleSaiSetStatus(SAI_API_ROUTE, status);
}
}

Expand All @@ -1751,7 +1752,7 @@ bool RouteOrch::addRoutePost(const RouteBulkContext& ctx, const NextHopGroupKey
{
SWSS_LOG_ERROR("Failed to set route %s with next hop(s) %s",
ipPrefix.to_string().c_str(), nextHops.to_string().c_str());
return false;
return handleSaiSetStatus(SAI_API_ROUTE, status);
}

/* Increase the ref_count for the next hop (group) entry */
Expand Down Expand Up @@ -1881,7 +1882,7 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx)
{
SWSS_LOG_ERROR("Failed to set route %s packet action to drop, rv:%d",
ipPrefix.to_string().c_str(), status);
return false;
return handleSaiSetStatus(SAI_API_ROUTE, status);
}

SWSS_LOG_INFO("Set route %s packet action to drop", ipPrefix.to_string().c_str());
Expand All @@ -1891,7 +1892,7 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx)
{
SWSS_LOG_ERROR("Failed to set route %s next hop ID to NULL, rv:%d",
ipPrefix.to_string().c_str(), status);
return false;
return handleSaiSetStatus(SAI_API_ROUTE, status);
}

SWSS_LOG_INFO("Set route %s next hop ID to NULL", ipPrefix.to_string().c_str());
Expand All @@ -1902,7 +1903,7 @@ bool RouteOrch::removeRoutePost(const RouteBulkContext& ctx)
if (status != SAI_STATUS_SUCCESS)
{
SWSS_LOG_ERROR("Failed to remove route prefix:%s\n", ipPrefix.to_string().c_str());
return false;
return handleSaiRemoveStatus(SAI_API_ROUTE, status);
}

if (ipPrefix.isV4())
Expand Down

0 comments on commit fc8e43f

Please sign in to comment.