Skip to content

Commit

Permalink
[Supervisord] Deduplicate the alerting messages of critical processes…
Browse files Browse the repository at this point in the history
… from Supervisord. (sonic-net#6849)

Signed-off-by: Yong Zhao yozhao@microsoft.com

Why I did it
rsyslog is configured to suppress duplicate messages and report them in the form "message repeated n times".
Because of this behavior, if a critical process in a container exits unexpectedly, the alerting message is written to syslog only once
and is not written again until a second critical process exits. This PR differentiates these alerting messages so that they are not suppressed by rsyslogd and appear in syslog periodically.

How I did it
This PR adds a counter to the alerting message that shows how many minutes a critical process has not been running.

How to verify it
I verified and tested this implementation on a physical DUT.
  • Loading branch information
yozhao101 authored and yxieca committed Mar 3, 2021
1 parent 2bce349 commit b3b1e35
Showing 1 changed file with 14 additions and 8 deletions.
22 changes: 14 additions & 8 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import signal
import sys
import syslog
import time
from collections import defaultdict

import swsssdk

Expand Down Expand Up @@ -64,7 +65,7 @@ def get_critical_group_and_process_list():
return critical_group_list, critical_process_list


def generate_alerting_message(process_name):
def generate_alerting_message(process_name, dead_minutes):
"""
@summary: If a critical process was not running, this function will determine it resides in host
or in a specific namespace. Then an alerting message will be written into syslog.
Expand All @@ -77,7 +78,8 @@ def generate_alerting_message(process_name):
else:
namespace = namespace_prefix + namespace_id

syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}'.".format(process_name, namespace))
syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
.format(process_name, namespace, dead_minutes))


def get_autorestart_state(container_name):
Expand Down Expand Up @@ -118,7 +120,7 @@ def main(argv):

critical_group_list, critical_process_list = get_critical_group_and_process_list()

process_under_alerting = {}
process_under_alerting = defaultdict(dict)
# Transition from ACKNOWLEDGED to READY
childutils.listener.ready()

Expand All @@ -145,7 +147,8 @@ def main(argv):
syslog.syslog(syslog.LOG_INFO, msg)
os.kill(os.getppid(), signal.SIGTERM)
else:
process_under_alerting[process_name] = time.time()
process_under_alerting[process_name]["last_alerted"] = time.time()
process_under_alerting[process_name]["dead_minutes"] = 0

# Handle the PROCESS_STATE_RUNNING event
elif headers['eventname'] == 'PROCESS_STATE_RUNNING':
Expand All @@ -162,11 +165,14 @@ def main(argv):
childutils.listener.ready()

# Check whether we need write alerting messages into syslog
for process in process_under_alerting.keys():
for process_name in process_under_alerting.keys():
epoch_time = time.time()
if epoch_time - process_under_alerting[process] >= ALERTING_INTERVAL_SECS:
process_under_alerting[process] = epoch_time
generate_alerting_message(process)
elapsed_secs = epoch_time - process_under_alerting[process_name]["last_alerted"]
if elapsed_secs >= ALERTING_INTERVAL_SECS:
elapsed_mins = elapsed_secs // 60
process_under_alerting[process_name]["last_alerted"] = epoch_time
process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])


if __name__ == "__main__":
Expand Down

0 comments on commit b3b1e35

Please sign in to comment.