From 4a0c0103c7dba820e4ab4aa2239b9a4437460b07 Mon Sep 17 00:00:00 2001 From: yozhao101 <56170650+yozhao101@users.noreply.github.com> Date: Wed, 17 Mar 2021 23:51:24 -0700 Subject: [PATCH] [config] Disable/enable container monitoring when stopping/starting services (#1499) Signed-off-by: Yong Zhao yozhao@microsoft.com What I did: When we ran the commands sudo config reload or sudo config load_minigraph, the containers swss, snmp, lldp, teamd, syncd, bgp, radv, pmon, dhcp_relay, telemetry, mgmt-framework and restapi were stopped and then restarted. During this stop-and-restart window, the script container_checker run by Monit generated false alerting messages in syslog indicating that some containers were not running. This PR aims to prevent Monit from generating these false alarm messages. How I did it: Before stopping services, we tell Monit to stop monitoring the running status of containers (monit unmonitor container_checker). After restarting services, we tell Monit to resume monitoring the running status of containers (monit monitor container_checker). How to verify it: I deliberately reduced the monitoring interval of Monit from 60 seconds to 10 seconds to ensure that the alerting messages from the script container_checker were generated during sudo config reload and sudo config load_minigraph. After this change was added to _stop_services() and _restart_services(), I checked that the alerting messages from container_checker no longer appeared in the syslog. I verified this change on the virtual switch. 
--- config/main.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/config/main.py b/config/main.py index 78dee15a21f6..d27562bd4eec 100644 --- a/config/main.py +++ b/config/main.py @@ -669,6 +669,13 @@ def _get_disabled_services_list(config_db): def _stop_services(): + try: + subprocess.check_call("sudo monit status", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + click.echo("Disabling container monitoring ...") + clicommon.run_command("sudo monit unmonitor container_checker") + except subprocess.CalledProcessError as err: + pass + click.echo("Stopping SONiC target ...") clicommon.run_command("sudo systemctl stop sonic.target") @@ -692,6 +699,13 @@ def _restart_services(): click.echo("Reloading Monit configuration ...") clicommon.run_command("sudo monit reload") + try: + subprocess.check_call("sudo monit status", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + click.echo("Enabling container monitoring ...") + clicommon.run_command("sudo monit monitor container_checker") + except subprocess.CalledProcessError as err: + pass + def interface_is_in_vlan(vlan_member_table, interface_name): """ Check if an interface is in a vlan """