-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
supervisor-proc-exit-listener
executable file
·219 lines (179 loc) · 9.03 KB
/
supervisor-proc-exit-listener
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env python3
import getopt
import os
import re
import select
import signal
import sys
import syslog
import time
from collections import defaultdict
from swsscommon import swsscommon
from supervisor import childutils
# Optional file listing the processes whose heartbeat is monitored.
# Each line of this file should specify one process (as defined in the supervisord.conf file), in the
# following format:
#
# program:<process_name>
WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
# Each line of this file should specify either one critical process or one
# critical process group (as defined in the supervisord.conf file), in the
# following format:
#
# program:<process_name>
# group:<group_name>
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'
# Name of the Config DB table whose entries contain the 'auto_restart' field
FEATURE_TABLE_NAME = 'FEATURE'
# Value of the 'timeout' parameter of select(...), in seconds
SELECT_TIMEOUT_SECS = 1.0
# Number of seconds a process must have been down (or silent) before an alerting
# message is written into syslog; also the interval between repeated
# 'not running' alerts for the same process
ALERTING_INTERVAL_SECS = 60
# Source name passed to swsscommon.events_init_publisher()
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag passed to swsscommon.event_publish() when a critical process exits unexpectedly
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"
def get_group_and_process_list(process_file):
    """
    @summary: Parse a processes file in which every non-blank line has the form
              'program:<process_name>' or 'group:<group_name>'.
    @return: Two lists, in file order: the group names and the process names.
             If a line is malformed, an error is written into syslog and the
             script exits with status 5 (not exactly one ':') or 6 (unknown
             key or empty value).
    """
    groups = []
    programs = []
    # Maps the key in front of ':' to the list that collects its values.
    buckets = {"group": groups, "program": programs}
    error_template = "Syntax of the line {} in processes file is incorrect. Exiting..."

    with open(process_file, 'r') as handle:
        for raw_line in handle:
            # Lines made only of whitespace carry no entry.
            if not raw_line.strip():
                continue

            # Exactly one ':' must separate the key from the value.
            if raw_line.count(':') != 1:
                syslog.syslog(syslog.LOG_ERR, error_template.format(raw_line))
                sys.exit(5)

            key, _, value = (part.strip() for part in raw_line.partition(':'))
            bucket = buckets.get(key)
            if bucket is None or not value:
                syslog.syslog(syslog.LOG_ERR, error_template.format(raw_line))
                sys.exit(6)

            bucket.append(value)

    return groups, programs
def generate_alerting_message(process_name, status, dead_minutes, priority=syslog.LOG_ERR):
    """
    @summary: Write an alerting message about a process into syslog. The message
              names the namespace the listener runs in: the concatenation of the
              NAMESPACE_PREFIX and NAMESPACE_ID environment variables, or 'host'
              when either of them is unset or empty.
    """
    prefix = os.environ.get("NAMESPACE_PREFIX")
    identifier = os.environ.get("NAMESPACE_ID")
    namespace = prefix + identifier if prefix and identifier else "host"

    message = "Process '{}' is {} in namespace '{}' ({} minutes).".format(
        process_name, status, namespace, dead_minutes)
    syslog.syslog(priority, message)
def get_autorestart_state(container_name, use_unix_socket_path):
    """
    @summary: Look up the 'auto_restart' field of the given container's entry
              in the FEATURE table of Config_DB.
    @return: The value of the 'auto_restart' field. If the table cannot be read
             (exit status 2), has no entry for the container (3), or the entry
             has no 'auto_restart' value (4), an error is written into syslog
             and the script exits.
    """
    def log_and_exit(message, exit_code):
        # Report the failure, then terminate the listener with the given status.
        syslog.syslog(syslog.LOG_ERR, message)
        sys.exit(exit_code)

    connector = swsscommon.ConfigDBConnector(use_unix_socket_path=use_unix_socket_path)
    connector.connect()
    feature_entries = connector.get_table(FEATURE_TABLE_NAME)

    if not feature_entries:
        log_and_exit("Unable to retrieve features table from Config DB. Exiting...", 2)

    if container_name not in feature_entries:
        log_and_exit("Unable to retrieve feature '{}'. Exiting...".format(container_name), 3)

    auto_restart_state = feature_entries[container_name].get('auto_restart')
    if not auto_restart_state:
        log_and_exit(
            "Unable to determine auto-restart feature status for '{}'. Exiting...".format(container_name), 4)

    return auto_restart_state
def publish_events(events_handle, process_name, container_name):
    """
    @summary: Publish an event tagged EVENTS_PUBLISHER_TAG through the given
              publisher handle, with the process name and the container name
              as its parameters.
    """
    event_params = swsscommon.FieldValueMap()
    for field, value in (("process_name", process_name), ("ctr_name", container_name)):
        event_params[field] = value

    swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, event_params)
def main(argv):
    """
    @summary: Run the supervisor event listener loop for one container.

              Options read from argv:
                -c, --container-name <name>   (required) feature/container name
                                              looked up in the FEATURE table
                -s, --use-unix-socket-path    connect to Config DB over the
                                              unix socket

              Events are read from stdin:
                * PROCESS_STATE_EXITED: when a critical process (or a process of
                  a critical group) exits unexpectedly, either SIGTERM is sent to
                  the parent process (auto-restart not 'disabled') or the process
                  is put under periodic 'not running' alerting.
                * PROCESS_STATE_RUNNING: the process is removed from alerting.
                * PROCESS_COMMUNICATION_STDOUT: recorded as a heartbeat for the
                  processes listed in WATCH_PROCESSES_FILE.

              Alerts of both kinds ('not running' and 'stuck') are written at
              most once per ALERTING_INTERVAL_SECS for each process.
    @return: Never returns normally; exits with status 1 when no container name
             is given.
    """
    container_name = None
    use_unix_socket_path = False

    opts, _ = getopt.getopt(argv, "c:s", ["container-name=", "use-unix-socket-path"])
    for opt, arg in opts:
        if opt in ("-c", "--container-name"):
            container_name = arg
        elif opt in ("-s", "--use-unix-socket-path"):
            use_unix_socket_path = True

    if not container_name:
        syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
        sys.exit(1)

    critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)

    # WATCH_PROCESSES_FILE is optional
    watch_process_list = []
    if os.path.exists(WATCH_PROCESSES_FILE):
        _, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)

    # process name -> {"last_alerted": <epoch secs>, "dead_minutes": <minutes>}
    process_under_alerting = defaultdict(dict)
    # process name -> {"last_heart_beat": <epoch secs>[, "last_alerted": <epoch secs>]}
    process_heart_beat_info = defaultdict(dict)

    # Transition from ACKNOWLEDGED to READY
    childutils.listener.ready()

    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)

    while True:
        # Wait for an event, but wake up every SELECT_TIMEOUT_SECS so the
        # alerting checks below run even when no event arrives.
        readable = select.select([sys.stdin], [], [], SELECT_TIMEOUT_SECS)[0]
        if readable:
            line = readable[0].readline()
            headers = childutils.get_headers(line)
            payload = sys.stdin.read(int(headers['len']))
            event_name = headers['eventname']

            # Handle the PROCESS_STATE_EXITED event
            if event_name == 'PROCESS_STATE_EXITED':
                payload_headers, _ = childutils.eventdata(payload + '\n')
                expected = int(payload_headers['expected'])
                process_name = payload_headers['processname']
                group_name = payload_headers['groupname']

                is_critical = process_name in critical_process_list or group_name in critical_group_list
                if is_critical and expected == 0:
                    is_auto_restart = get_autorestart_state(container_name, use_unix_socket_path)
                    if is_auto_restart != "disabled":
                        msg = "Process '{}' exited unexpectedly. Terminating supervisor '{}'".format(
                            process_name, container_name)
                        syslog.syslog(syslog.LOG_INFO, msg)
                        publish_events(events_handle, process_name, container_name)
                        swsscommon.events_deinit_publisher(events_handle)
                        os.kill(os.getppid(), signal.SIGTERM)
                    else:
                        # Auto-restart is disabled: keep the container up and
                        # start periodic 'not running' alerts instead.
                        process_under_alerting[process_name]["last_alerted"] = time.time()
                        process_under_alerting[process_name]["dead_minutes"] = 0

            # Handle the PROCESS_STATE_RUNNING event
            elif event_name == 'PROCESS_STATE_RUNNING':
                payload_headers, _ = childutils.eventdata(payload + '\n')
                process_under_alerting.pop(payload_headers['processname'], None)

            # Handle the PROCESS_COMMUNICATION_STDOUT event
            elif event_name == 'PROCESS_COMMUNICATION_STDOUT':
                payload_headers, _ = childutils.eventdata(payload + '\n')
                process_name = payload_headers['processname']

                # update process heart beat time
                if process_name in watch_process_list:
                    heart_beat = process_heart_beat_info[process_name]
                    heart_beat["last_heart_beat"] = time.time()
                    # A fresh heartbeat re-arms the 'stuck' alert.
                    heart_beat.pop("last_alerted", None)

            # Transition from BUSY to ACKNOWLEDGED
            childutils.listener.ok()

            # Transition from ACKNOWLEDGED to READY
            childutils.listener.ready()

        # Check whether we need to write 'not running' alerting messages into syslog
        for process_name, alert_info in process_under_alerting.items():
            epoch_time = time.time()
            elapsed_secs = epoch_time - alert_info["last_alerted"]
            if elapsed_secs >= ALERTING_INTERVAL_SECS:
                alert_info["last_alerted"] = epoch_time
                alert_info["dead_minutes"] += elapsed_secs // 60
                generate_alerting_message(process_name, "not running", alert_info["dead_minutes"])

        # Check whether we need to write 'stuck' alerting messages into syslog
        for process_name, heart_beat in process_heart_beat_info.items():
            epoch_time = time.time()
            elapsed_secs = epoch_time - heart_beat["last_heart_beat"]
            if elapsed_secs < ALERTING_INTERVAL_SECS:
                continue

            # Throttle: without this, a silent process would trigger a warning
            # on every loop iteration (about once per SELECT_TIMEOUT_SECS).
            last_alerted = heart_beat.get("last_alerted")
            if last_alerted is not None and epoch_time - last_alerted < ALERTING_INTERVAL_SECS:
                continue

            heart_beat["last_alerted"] = epoch_time
            generate_alerting_message(process_name, "stuck", elapsed_secs // 60, syslog.LOG_WARNING)
# Script entry point: pass the command-line arguments (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])