From 4f2773c9a4838bcf6816506645cd36ec0a3f8057 Mon Sep 17 00:00:00 2001 From: Renuka Manavalan <47282725+renukamanavalan@users.noreply.github.com> Date: Thu, 20 Jan 2022 09:27:50 -0800 Subject: [PATCH] [generic-config-updater] Handle failed service restarts (#2020) What I did During config update, update of certain tables do demand service restart. With multiple related updates are not grouped together, this might result in too many service restarts, which could fail with "hitting start limit". When that happens, call reset-failed, try to restart. If it fails again, take a pause and try to restart again. How I did it When service restart fails, call reset-failed, try, pause and then call service restart again. --- generic_config_updater/services_validator.py | 35 +++++++++++++-- .../service_validator_test.py | 43 ++++++++++++++++--- 2 files changed, 68 insertions(+), 10 deletions(-) diff --git a/generic_config_updater/services_validator.py b/generic_config_updater/services_validator.py index 89323df004d5..f69e870ed657 100644 --- a/generic_config_updater/services_validator.py +++ b/generic_config_updater/services_validator.py @@ -17,13 +17,42 @@ def set_verbose(verbose=False): def _service_restart(svc_name): rc = os.system(f"systemctl restart {svc_name}") - logger.log(logger.LOG_PRIORITY_NOTICE, - f"Restarted {svc_name}", print_to_console) + if rc != 0: + # This failure is likely due to too many restarts + # + rc = os.system(f"systemctl reset-failed {svc_name}") + logger.log(logger.LOG_PRIORITY_ERROR, + f"Service has been reset. rc={rc}; Try restart again...", + print_to_console) + + rc = os.system(f"systemctl restart {svc_name}") + if rc != 0: + # Even with reset-failed, restart fails. + # Give a pause before retry. + # + logger.log(logger.LOG_PRIORITY_ERROR, + f"Restart failed for {svc_name} rc={rc} after reset; Pause for 10s & retry", + print_to_console) + os.system("sleep 10s") + rc = os.system(f"systemctl restart {svc_name}") + + if rc == 0: + logger.log(logger.LOG_PRIORITY_NOTICE, + f"Restart succeeded for {svc_name}", + print_to_console) + else: + logger.log(logger.LOG_PRIORITY_ERROR, + f"Restart failed for {svc_name} rc={rc}", + print_to_console) return rc == 0 def rsyslog_validator(old_config, upd_config, keys): - return _service_restart("rsyslog-config") + rc = os.system("/usr/bin/rsyslog-config.sh") + if rc != 0: + return _service_restart("rsyslog") + else: + return True def dhcp_validator(old_config, upd_config, keys): diff --git a/tests/generic_config_updater/service_validator_test.py b/tests/generic_config_updater/service_validator_test.py index 749afaab2ff0..470e5428a79e 100644 --- a/tests/generic_config_updater/service_validator_test.py +++ b/tests/generic_config_updater/service_validator_test.py @@ -6,18 +6,25 @@ from collections import defaultdict from unittest.mock import patch -from generic_config_updater.services_validator import vlan_validator +from generic_config_updater.services_validator import vlan_validator, rsyslog_validator import generic_config_updater.gu_common # Mimics os.system call # -os_system_expected_cmd = "" +os_system_calls = [] +os_system_call_index = 0 msg = "" -def os_system_cfggen(cmd): - assert cmd == os_system_expected_cmd, msg - return 0 +def mock_os_system_call(cmd): + global os_system_calls, os_system_call_index + + assert os_system_call_index < len(os_system_calls) + entry = os_system_calls[os_system_call_index] + os_system_call_index += 1 + + assert cmd == entry["cmd"], msg + return entry["rc"] test_data = [ @@ -53,19 +60,41 @@ def os_system_cfggen(cmd): } ] +test_rsyslog_fail = [ + # Fail the calls, to get the entire fail path calls invoked + # + { "cmd": "/usr/bin/rsyslog-config.sh", "rc": 1 }, # config update; fails + { "cmd": "systemctl restart rsyslog", "rc": 1 }, # rsyslog restart; fails + { "cmd": "systemctl reset-failed rsyslog", "rc": 1 }, # reset; failure here just logs + { "cmd": "systemctl restart rsyslog", "rc": 1 }, # restart again; fails + { "cmd": "sleep 10s", "rc": 0 }, # sleep; rc ignored + { "cmd": "systemctl restart rsyslog", "rc": 1 }, # restart again; fails + ] + + class TestServiceValidator(unittest.TestCase): @patch("generic_config_updater.change_applier.os.system") def test_change_apply(self, mock_os_sys): global os_system_expected_cmd + global os_system_calls, os_system_call_index - mock_os_sys.side_effect = os_system_cfggen + mock_os_sys.side_effect = mock_os_system_call i = 0 for entry in test_data: - os_system_expected_cmd = entry["cmd"] + if entry["cmd"]: + os_system_calls.append({"cmd": entry["cmd"], "rc": 0 }) msg = "case failed: {}".format(str(entry)) vlan_validator(entry["old"], entry["upd"], None) + # Test failure case + # + os_system_calls = test_rsyslog_fail + os_system_call_index = 0 + + rc = rsyslog_validator("", "", "") + assert not rc, "rsyslog_validator expected to fail" +