From 313ef5cdaa8a4355234e7e2ccff58c2ecb8e8891 Mon Sep 17 00:00:00 2001 From: Prince Sunny Date: Mon, 9 Sep 2019 10:26:11 -0700 Subject: [PATCH] Warmboot Vlan neigh restore fix (#1040) * Send arp request after first Vlan member port is added * Add wait logic after Vlan member add, nbrmgr to wait for restore complete * Address comment to pass db as a parameter and open only once --- cfgmgr/nbrmgr.cpp | 16 ++++++++- cfgmgr/nbrmgr.h | 4 ++- cfgmgr/nbrmgrd.cpp | 19 +++++++++++ neighsyncd/restore_neighbors.py | 57 +++++++++++++++++++++++++++------ 4 files changed, 84 insertions(+), 12 deletions(-) diff --git a/cfgmgr/nbrmgr.cpp b/cfgmgr/nbrmgr.cpp index d2ca7ebc96dc..e3080b7c01a7 100644 --- a/cfgmgr/nbrmgr.cpp +++ b/cfgmgr/nbrmgr.cpp @@ -47,7 +47,8 @@ NbrMgr::NbrMgr(DBConnector *cfgDb, DBConnector *appDb, DBConnector *stateDb, con m_statePortTable(stateDb, STATE_PORT_TABLE_NAME), m_stateLagTable(stateDb, STATE_LAG_TABLE_NAME), m_stateVlanTable(stateDb, STATE_VLAN_TABLE_NAME), - m_stateIntfTable(stateDb, STATE_INTERFACE_TABLE_NAME) + m_stateIntfTable(stateDb, STATE_INTERFACE_TABLE_NAME), + m_stateNeighRestoreTable(stateDb, STATE_NEIGH_RESTORE_TABLE_NAME) { int err = 0; @@ -91,6 +92,19 @@ bool NbrMgr::isIntfStateOk(const string &alias) return false; } +bool NbrMgr::isNeighRestoreDone() +{ + string value; + + m_stateNeighRestoreTable.hget("Flags", "restored", value); + if (value == "true") + { + SWSS_LOG_INFO("Kernel neighbor table restore is done"); + return true; + } + return false; +} + bool NbrMgr::setNeighbor(const string& alias, const IpAddress& ip, const MacAddress& mac) { SWSS_LOG_ENTER(); diff --git a/cfgmgr/nbrmgr.h b/cfgmgr/nbrmgr.h index 61fba1bf7b28..24d83971a1f2 100644 --- a/cfgmgr/nbrmgr.h +++ b/cfgmgr/nbrmgr.h @@ -20,13 +20,15 @@ class NbrMgr : public Orch NbrMgr(DBConnector *cfgDb, DBConnector *appDb, DBConnector *stateDb, const vector &tableNames); using Orch::doTask; + bool isNeighRestoreDone(); + private: bool isIntfStateOk(const string &alias); bool setNeighbor(const string& alias, const IpAddress& ip, const MacAddress& mac); void doTask(Consumer &consumer); - Table m_statePortTable, m_stateLagTable, m_stateVlanTable, m_stateIntfTable; + Table m_statePortTable, m_stateLagTable, m_stateVlanTable, m_stateIntfTable, m_stateNeighRestoreTable; struct nl_sock *m_nl_sock; }; diff --git a/cfgmgr/nbrmgrd.cpp b/cfgmgr/nbrmgrd.cpp index 2872fa9e6796..7623a9d79511 100644 --- a/cfgmgr/nbrmgrd.cpp +++ b/cfgmgr/nbrmgrd.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include "select.h" #include "exec.h" @@ -12,6 +13,9 @@ using namespace std; using namespace swss; +#define RESTORE_NEIGH_WAIT_TIME_OUT 120 +#define RESTORE_NEIGH_WAIT_TIME_INT 10 + /* select() function timeout retry time, in millisecond */ #define SELECT_TIMEOUT 1000 @@ -50,6 +54,21 @@ int main(int argc, char **argv) NbrMgr nbrmgr(&cfgDb, &appDb, &stateDb, cfg_nbr_tables); + chrono::steady_clock::time_point starttime = chrono::steady_clock::now(); + while (!nbrmgr.isNeighRestoreDone()) + { + chrono::duration time_span = chrono::duration_cast> + (chrono::steady_clock::now() - starttime); + int pasttime = int(time_span.count()); + SWSS_LOG_INFO("Kernel neighbor table restoration waited for %d seconds", pasttime); + if (pasttime > RESTORE_NEIGH_WAIT_TIME_OUT) + { + SWSS_LOG_WARN("Kernel neighbor table restore is not finished!"); + break; + } + sleep(RESTORE_NEIGH_WAIT_TIME_INT); + } + std::vector cfgOrchList = {&nbrmgr}; swss::Select s; diff --git a/neighsyncd/restore_neighbors.py b/neighsyncd/restore_neighbors.py index d0851c632ae7..73453ce7d2a8 100755 --- a/neighsyncd/restore_neighbors.py +++ b/neighsyncd/restore_neighbors.py @@ -25,11 +25,29 @@ from scapy.all import conf, in6_getnsma, inet_pton, inet_ntop, in6_getnsmac, get_if_hwaddr, Ether, ARP, IPv6, ICMPv6ND_NS, ICMPv6NDOptSrcLLAddr from swsscommon import swsscommon import errno +import syslog logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) logger.addHandler(logging.NullHandler()) +SYSLOG_IDENTIFIER = 'restore_neighbor' + +def log_info(msg): + syslog.openlog(SYSLOG_IDENTIFIER) + syslog.syslog(syslog.LOG_INFO, msg) + syslog.closelog() + +def log_warning(msg): + syslog.openlog(SYSLOG_IDENTIFIER) + syslog.syslog(syslog.LOG_WARNING, msg) + syslog.closelog() + +def log_error(msg): + syslog.openlog(SYSLOG_IDENTIFIER) + syslog.syslog(syslog.LOG_ERR, msg) + syslog.closelog() + # timeout the restore process in 110 seconds if not finished # This is mostly to wait for interfaces to be created and up after system warm-reboot # and this process is started by supervisord in swss docker. @@ -58,12 +76,27 @@ def is_intf_oper_state_up(intf): state_file = open(oper_file.format(intf), 'r') state = state_file.readline().rstrip() except Exception as e: - logger.info('Error: {}'.format(str(e))) + log_info('Error: {}'.format(str(e))) return False if state == '1': return True return False +def is_intf_up(intf, db): + if not is_intf_oper_state_up(intf): + return False + if 'Vlan' in intf: + table_name = 'VLAN_MEMBER_TABLE|{}|*'.format(intf) + key = db.keys(db.STATE_DB, table_name) + if key is None: + log_info ("Vlan member is not yet created") + return False + if is_intf_up.counter == 0: + time.sleep(3*CHECK_INTERVAL) + is_intf_up.counter = 1 + log_info ("intf {} is up".format(intf)) + return True + # read the neigh table from AppDB to memory, format as below # build map as below, this can efficiently access intf and family groups later # { intf1 -> { { family1 -> [[ip1, mac1], [ip2, mac2] ...] } @@ -131,7 +164,7 @@ def read_neigh_table_to_maps(): # Use netlink to set neigh table into kernel, not overwrite the existing ones def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac): - logging.info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format( + log_info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format( family, intf_idx, dst_ip, dmac)) if family not in ip_family: @@ -152,7 +185,7 @@ def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac): # If neigh exists, log it but no exception raise, other exceptions, raise except NetlinkError as e: if e[0] == errno.EEXIST: - logger.warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format( + log_warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format( family, intf_idx, dst_ip, dmac)) else: raise @@ -196,10 +229,13 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT): ipclass = IPRoute() mtime = monotonic.time.time start_time = mtime() + is_intf_up.counter = 0 + db = swsssdk.SonicV2Connector(host='127.0.0.1') + db.connect(db.STATE_DB, False) while (mtime() - start_time) < timeout: for intf, family_neigh_map in intf_neigh_map.items(): # only try to restore to kernel when link is up - if is_intf_oper_state_up(intf): + if is_intf_up(intf, db): src_mac = get_if_hwaddr(intf) intf_idx = ipclass.link_lookup(ifname=intf)[0] # create socket per intf to send packets @@ -215,6 +251,8 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT): # use netlink to set neighbor entries set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac) + log_info('Sending Neigh with family: {}, intf_idx: {}, ip: {}, mac: {}'.format( + family, intf_idx, dst_ip, dmac)) # sending arp/ns packet to update kernel neigh info s.send(build_arp_ns_pkt(family, src_mac, src_ip, dst_ip)) # delete this family on the intf @@ -229,12 +267,12 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT): if not intf_neigh_map: break time.sleep(CHECK_INTERVAL) + db.close(db.STATE_DB) def main(): - print "restore_neighbors service is started" - + log_info ("restore_neighbors service is started") # Use warmstart python binding to check warmstart information warmstart = swsscommon.WarmStart() warmstart.initialize("neighsyncd", "swss") @@ -242,15 +280,14 @@ def main(): # if swss or system warm reboot not enabled, don't run if not warmstart.isWarmStart(): - print "restore_neighbors service is skipped as warm restart not enabled" + log_info ("restore_neighbors service is skipped as warm restart not enabled") return # swss restart not system warm reboot, set statedb directly if not warmstart.isSystemWarmRebootEnabled(): set_statedb_neigh_restore_done() - print "restore_neighbors service is done as system warm reboot not enabled" + log_info ("restore_neighbors service is done as system warm reboot not enabled") return - # read the neigh table from appDB to internal map try: intf_neigh_map = read_neigh_table_to_maps() @@ -266,7 +303,7 @@ def main(): # set statedb to signal other processes like neighsyncd set_statedb_neigh_restore_done() - print "restore_neighbor service is done for system warmreboot" + log_info ("restore_neighbor service is done for system warmreboot") return if __name__ == '__main__':