Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[multi-asic] Enhancing monit process checker for multi-asic. #6100

Merged
merged 6 commits into from
Dec 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 46 additions & 9 deletions files/image_config/monit/process_checker
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#!/usr/bin/python3

import argparse
import ast
import sys
import syslog

import psutil
from sonic_py_common import multi_asic
import swsssdk


Expand All @@ -27,20 +29,55 @@ def check_process_existence(container_name, process_cmdline):
# We leveraged the psutil library to help us check whether the process is running or not.
# If the process entity is found in process tree and it is also in the 'running' or 'sleeping'
# state, then it will be marked as 'running'.
is_running = False
for process in psutil.process_iter(["cmdline", "status"]):

# For the given feature, we get the host and network namespace instances in which its processes should be
# running, based on its scope, and add them to the expected set.

# From psutil we get the running instances of the process and add the namespace of each one to the actual set.

# The difference between the expected and actual sets gives the instances where the process is not running;
# these will be logged as a syslog message by Monit.

process_namespace_expected_set = set()
process_namespace_found_set = set()

has_global_scope = ast.literal_eval(feature_table[container_name].get('has_global_scope', 'True'))
has_per_asic_scope = ast.literal_eval(feature_table[container_name].get('has_per_asic_scope', 'False'))

if has_global_scope:
process_namespace_expected_set.add(multi_asic.DEFAULT_NAMESPACE)

if has_per_asic_scope:
process_namespace_expected_set.update(multi_asic.get_namespace_list())

for process in psutil.process_iter(["cmdline", "status", "pid"]):
try:
if ((' '.join(process.cmdline())).startswith(process_cmdline) and process.status() in ["running", "sleeping"]):
is_running = True
break
process_namespace_found_set.add(multi_asic.get_current_namespace(process.info['pid']))
except psutil.NoSuchProcess:
pass

if not is_running:
# If this script is run by Monit, then the following output will be appended to
# Monit's syslog message.
print("'{}' is not running.".format(process_cmdline))
sys.exit(1)
process_namespace_diff_set = process_namespace_expected_set.difference(process_namespace_found_set)

if process_namespace_diff_set:
host_display_str = ""
namespace_display_str = ""

for ns in process_namespace_diff_set:
if ns == multi_asic.DEFAULT_NAMESPACE:
host_display_str = " in host"
else:
if not namespace_display_str:
namespace_display_str = " in namespace " + ns
else:
namespace_display_str += ", " + ns

join_str = " and" if host_display_str and namespace_display_str else ""

# If this script is run by Monit, then the following output will be appended to
# Monit's syslog message.
print("'{}' is not running{}{}{}".format(process_cmdline, host_display_str, join_str, namespace_display_str))
sys.exit(1)
else:
syslog.syslog(syslog.LOG_ERR, "container '{}' is not included in SONiC image or the given container name is invalid!"
.format(container_name))
Expand Down
6 changes: 4 additions & 2 deletions src/sonic-py-common/sonic_py_common/multi_asic.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,14 @@ def get_asic_id_from_name(asic_name):
raise ValueError('Unknown asic namespace name {}'.format(asic_name))


def get_current_namespace():
def get_current_namespace(pid=None):
"""
This API returns the network namespace in which it is
invoked. In the case of the global namespace, the API returns None
"""

net_namespace = None
command = ["/bin/ip netns identify", str(os.getpid())]
command = ["sudo /bin/ip netns identify {}".format(os.getpid() if not pid else pid)]
proc = subprocess.Popen(command,
stdout=subprocess.PIPE,
shell=True,
Expand All @@ -159,6 +159,8 @@ def get_current_namespace():
)
if stdout.rstrip('\n') != "":
net_namespace = stdout.rstrip('\n')
else:
net_namespace = DEFAULT_NAMESPACE
except OSError as e:
raise OSError("Error running command {}".format(command))

Expand Down