diff --git a/scripts/route_check.py b/scripts/route_check.py index c6234bcc9d..4db3f399a2 100755 --- a/scripts/route_check.py +++ b/scripts/route_check.py @@ -11,11 +11,11 @@ How: NOTE: The flow from APPL-DB to ASIC-DB takes non zero milliseconds. 1) Initiate subscribe for ASIC-DB updates. - 2) Read APPL-DB & ASIC-DB + 2) Read APPL-DB & ASIC-DB 3) Get the diff. - 4) If any diff, + 4) If any diff, 4.1) Collect subscribe messages for a second - 4.2) check diff against the subscribe messages + 4.2) check diff against the subscribe messages 5) Rule out local interfaces & default routes 6) If still outstanding diffs, report failure. @@ -29,7 +29,7 @@ down to ensure failure. Analyze the reported failures to match expected. You may use the exit code to verify the result as success or not. - + """ @@ -45,6 +45,7 @@ import time import signal import traceback +import subprocess from swsscommon import swsscommon from utilities_common import chassis @@ -71,6 +72,9 @@ PRINT_MSG_LEN_MAX = 1000 +FRR_CHECK_RETRIES = 3 +FRR_WAIT_TIME = 15 + class Level(Enum): ERR = 'ERR' INFO = 'INFO' @@ -293,7 +297,7 @@ def get_routes(): def get_route_entries(): """ - helper to read present route entries from ASIC-DB and + helper to read present route entries from ASIC-DB and as well initiate selector for ASIC-DB:ASIC-state updates. :return (selector, subscriber, ) """ @@ -309,7 +313,7 @@ def get_route_entries(): res, e = checkout_rt_entry(k) if res: rt.append(e) - + print_message(syslog.LOG_DEBUG, json.dumps({"ASIC_ROUTE_ENTRY": sorted(rt)}, indent=4)) selector = swsscommon.Select() @@ -317,6 +321,31 @@ def get_route_entries(): return (selector, subs, sorted(rt)) +def is_suppress_fib_pending_enabled(): + """ + Returns True if FIB suppression is enabled, False otherwise + """ + cfg_db = swsscommon.ConfigDBConnector() + cfg_db.connect() + + state = cfg_db.get_entry('DEVICE_METADATA', 'localhost').get('suppress-fib-pending') + + return state == 'enabled' + + +def get_frr_routes(): + """ + Read routes from zebra through CLI command + :return frr routes dictionary + """ + + output = subprocess.check_output('show ip route json', shell=True) + routes = json.loads(output) + output = subprocess.check_output('show ipv6 route json', shell=True) + routes.update(json.loads(output)) + return routes + + def get_interfaces(): """ helper to read interface table from APPL-DB. @@ -354,7 +383,7 @@ def filter_out_local_interfaces(keys): chassis_local_intfs = chassis.get_chassis_local_interfaces() local_if_lst.update(set(chassis_local_intfs)) - + db = swsscommon.DBConnector(APPL_DB_NAME, 0) tbl = swsscommon.Table(db, 'ROUTE_TABLE') @@ -493,6 +522,61 @@ def filter_out_standalone_tunnel_routes(routes): return updated_routes +def check_frr_pending_routes(): + """ + Check FRR routes for offload flag presence by executing "show ip route json" + Returns a list of routes that have no offload flag. + """ + + missed_rt = [] + + retries = FRR_CHECK_RETRIES + for i in range(retries): + missed_rt = [] + frr_routes = get_frr_routes() + + for _, entries in frr_routes.items(): + for entry in entries: + if entry['protocol'] != 'bgp': + continue + + # TODO: Also handle VRF routes. Currently this script does not check for VRF routes so it would be incorrect for us + # to assume they are installed in ASIC_DB, so we don't handle them. + if entry['vrfName'] != 'default': + continue + + if not entry.get('offloaded', False): + missed_rt.append(entry) + + if not missed_rt: + break + + time.sleep(FRR_WAIT_TIME) + + return missed_rt + + +def mitigate_installed_not_offloaded_frr_routes(missed_frr_rt, rt_appl): + """ + Mitigate installed but not offloaded FRR routes. + + In case route exists in APPL_DB, this function will manually send a notification to fpmsyncd + to trigger the flow that sends offload flag to zebra. + + It is designed to mitigate a problem when orchagent fails to send notification about installed route to fpmsyncd + or fpmsyncd not being able to read the notification or in case zebra fails to receive offload update due to variety of reasons. + All of the above mentioned cases must be considered as a bug, but even in that case we will report an error in the log but + given that this script ensures the route is installed in the hardware it will automitigate such a bug. + """ + db = swsscommon.DBConnector('APPL_STATE_DB', 0) + response_producer = swsscommon.NotificationProducer(db, f'{APPL_DB_NAME}_{swsscommon.APP_ROUTE_TABLE_NAME}_RESPONSE_CHANNEL') + for entry in [entry for entry in missed_frr_rt if entry['prefix'] in rt_appl]: + fvs = swsscommon.FieldValuePairs([('err_str', 'SWSS_RC_SUCCESS'), ('protocol', entry['protocol'])]) + response_producer.send('SWSS_RC_SUCCESS', entry['prefix'], fvs) + + print_message(syslog.LOG_ERR, f'Mitigated route {entry["prefix"]}') + + def get_soc_ips(config_db): mux_table = config_db.get_table('MUX_CABLE') soc_ips = [] @@ -536,7 +620,7 @@ def check_routes(): """ The heart of this script which runs the checks. Read APPL-DB & ASIC-DB, the relevant tables for route checking. - Checkout routes in ASIC-DB to match APPL-DB, discounting local & + Checkout routes in ASIC-DB to match APPL-DB, discounting local & default routes. In case of missed / unexpected entries in ASIC, it might be due to update latency between APPL & ASIC DBs. So collect ASIC-DB subscribe updates for a second, and checkout if you see SET @@ -545,12 +629,16 @@ def check_routes(): If there are still some unjustifiable diffs, between APPL & ASIC DB, related to routes report failure, else all good. + If there are FRR routes that aren't marked offloaded but all APPL & ASIC DB + routes are in sync report failure and perform a mitigation action. + :return (0, None) on sucess, else (-1, results) where results holds the unjustifiable entries. """ intf_appl_miss = [] rt_appl_miss = [] rt_asic_miss = [] + rt_frr_miss = [] results = {} adds = [] @@ -599,11 +687,22 @@ def check_routes(): if rt_asic_miss: results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss + rt_frr_miss = check_frr_pending_routes() + + if rt_frr_miss: + results["missed_FRR_routes"] = rt_frr_miss + if results: print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}") print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above") print_message(syslog.LOG_WARNING, "add: ", json.dumps(adds, indent=4)) print_message(syslog.LOG_WARNING, "del: ", json.dumps(deletes, indent=4)) + + if rt_frr_miss and not rt_appl_miss and not rt_asic_miss: + print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR but all routes in APPL_DB and ASIC_DB are in sync") + if is_suppress_fib_pending_enabled(): + mitigate_installed_not_offloaded_frr_routes(rt_frr_miss, rt_appl) + return -1, results else: print_message(syslog.LOG_INFO, "All good!") @@ -649,7 +748,7 @@ def main(): return ret, res else: return ret, res - + if __name__ == "__main__": diff --git a/tests/mock_tables/config_db.json b/tests/mock_tables/config_db.json index 899dada260..51af58e86d 100644 --- a/tests/mock_tables/config_db.json +++ b/tests/mock_tables/config_db.json @@ -831,7 +831,8 @@ "mac": "1d:34:db:16:a6:00", "platform": "x86_64-mlnx_msn3800-r0", "peer_switch": "sonic-switch", - "type": "ToRRouter" + "type": "ToRRouter", + "suppress-fib-pending": "enabled" }, "SNMP_COMMUNITY|msft": { "TYPE": "RO" diff --git a/tests/route_check_test.py b/tests/route_check_test.py index 85e6a64a95..4d93c74e2d 100644 --- a/tests/route_check_test.py +++ b/tests/route_check_test.py @@ -7,7 +7,7 @@ import time from sonic_py_common import device_info from unittest.mock import MagicMock, patch -from tests.route_check_test_data import APPL_DB, ARGS, ASIC_DB, CONFIG_DB, DEFAULT_CONFIG_DB, DESCR, OP_DEL, OP_SET, PRE, RESULT, RET, TEST_DATA, UPD +from tests.route_check_test_data import APPL_DB, ARGS, ASIC_DB, CONFIG_DB, DEFAULT_CONFIG_DB, DESCR, OP_DEL, OP_SET, PRE, RESULT, RET, TEST_DATA, UPD, FRR_ROUTES import pytest @@ -239,6 +239,7 @@ def setup(self): def init(self): route_check.UNIT_TESTING = 1 + route_check.FRR_WAIT_TIME = 0 @pytest.fixture def force_hang(self): @@ -258,7 +259,8 @@ def mock_dbs(self): patch("route_check.swsscommon.Table") as mock_table, \ patch("route_check.swsscommon.Select") as mock_sel, \ patch("route_check.swsscommon.SubscriberStateTable") as mock_subs, \ - patch("route_check.swsscommon.ConfigDBConnector", return_value=mock_config_db): + patch("route_check.swsscommon.ConfigDBConnector", return_value=mock_config_db), \ + patch("route_check.swsscommon.NotificationProducer"): device_info.get_platform = MagicMock(return_value='unittest') set_mock(mock_table, mock_conn, mock_sel, mock_subs, mock_config_db) yield @@ -272,7 +274,21 @@ def test_route_check(self, mock_dbs, test_num): set_test_case_data(ct_data) logger.info("Running test case {}: {}".format(test_num, ct_data[DESCR])) - with patch('sys.argv', ct_data[ARGS].split()): + with patch('sys.argv', ct_data[ARGS].split()), \ + patch('route_check.subprocess.check_output') as mock_check_output: + + check_frr_patch = patch('route_check.check_frr_pending_routes', lambda: []) + + if FRR_ROUTES in ct_data: + routes = ct_data[FRR_ROUTES] + + def side_effect(*args, **kwargs): + return json.dumps(routes) + + mock_check_output.side_effect = side_effect + else: + check_frr_patch.start() + ret, res = route_check.main() expect_ret = ct_data[RET] if RET in ct_data else 0 expect_res = ct_data[RESULT] if RESULT in ct_data else None @@ -283,6 +299,8 @@ def test_route_check(self, mock_dbs, test_num): assert ret == expect_ret assert res == expect_res + check_frr_patch.stop() + def test_timeout(self, mock_dbs, force_hang): # Test timeout ex_raised = False diff --git a/tests/route_check_test_data.py b/tests/route_check_test_data.py index 9e4cd3a009..b8ba9c521a 100644 --- a/tests/route_check_test_data.py +++ b/tests/route_check_test_data.py @@ -6,6 +6,7 @@ CONFIG_DB = 4 PRE = "pre-value" UPD = "update" +FRR_ROUTES = "frr-routes" RESULT = "res" OP_SET = "SET" @@ -359,5 +360,106 @@ } } } - } + }, + "10": { + DESCR: "basic good one, check FRR routes", + ARGS: "route_check -m INFO -i 1000", + PRE: { + APPL_DB: { + ROUTE_TABLE: { + "0.0.0.0/0" : { "ifname": "portchannel0" }, + "10.10.196.12/31" : { "ifname": "portchannel0" }, + }, + INTF_TABLE: { + "PortChannel1013:10.10.196.24/31": {}, + "PortChannel1023:2603:10b0:503:df4::5d/126": {}, + "PortChannel1024": {} + } + }, + ASIC_DB: { + RT_ENTRY_TABLE: { + RT_ENTRY_KEY_PREFIX + "10.10.196.12/31" + RT_ENTRY_KEY_SUFFIX: {}, + RT_ENTRY_KEY_PREFIX + "10.10.196.24/32" + RT_ENTRY_KEY_SUFFIX: {}, + RT_ENTRY_KEY_PREFIX + "2603:10b0:503:df4::5d/128" + RT_ENTRY_KEY_SUFFIX: {}, + RT_ENTRY_KEY_PREFIX + "0.0.0.0/0" + RT_ENTRY_KEY_SUFFIX: {} + } + }, + }, + FRR_ROUTES: { + "0.0.0.0/0": [ + { + "prefix": "0.0.0.0/0", + "vrfName": "default", + "protocol": "bgp", + "offloaded": "true", + }, + ], + "10.10.196.12/31": [ + { + "prefix": "10.10.196.12/31", + "vrfName": "default", + "protocol": "bgp", + "offloaded": "true", + }, + ], + "10.10.196.24/31": [ + { + "protocol": "connected", + }, + ], + }, + }, + "11": { + DESCR: "failure test case, missing FRR routes", + ARGS: "route_check -m INFO -i 1000", + PRE: { + APPL_DB: { + ROUTE_TABLE: { + "0.0.0.0/0" : { "ifname": "portchannel0" }, + "10.10.196.12/31" : { "ifname": "portchannel0" }, + }, + INTF_TABLE: { + "PortChannel1013:10.10.196.24/31": {}, + "PortChannel1023:2603:10b0:503:df4::5d/126": {}, + "PortChannel1024": {} + } + }, + ASIC_DB: { + RT_ENTRY_TABLE: { + RT_ENTRY_KEY_PREFIX + "10.10.196.12/31" + RT_ENTRY_KEY_SUFFIX: {}, + RT_ENTRY_KEY_PREFIX + "10.10.196.24/32" + RT_ENTRY_KEY_SUFFIX: {}, + RT_ENTRY_KEY_PREFIX + "2603:10b0:503:df4::5d/128" + RT_ENTRY_KEY_SUFFIX: {}, + RT_ENTRY_KEY_PREFIX + "0.0.0.0/0" + RT_ENTRY_KEY_SUFFIX: {} + } + }, + }, + FRR_ROUTES: { + "0.0.0.0/0": [ + { + "prefix": "0.0.0.0/0", + "vrfName": "default", + "protocol": "bgp", + "offloaded": "true", + }, + ], + "10.10.196.12/31": [ + { + "prefix": "10.10.196.12/31", + "vrfName": "default", + "protocol": "bgp", + }, + ], + "10.10.196.24/31": [ + { + "protocol": "connected", + }, + ], + }, + RESULT: { + "missed_FRR_routes": [ + {"prefix": "10.10.196.12/31", "vrfName": "default", "protocol": "bgp"} + ], + }, + RET: -1, + }, }