Skip to content

Commit

Permalink
[container_checker] Use Feature table to get running containers (soni…
Browse files Browse the repository at this point in the history
…c-net#7474)

Why I did it
Finding running containers through "docker ps" breaks when Kubernetes deploys a container, because the container names are mangled.

How I did it
The data is available from the FEATURE table, which also covers Kubernetes deployments.

How to verify it
Deploy a feature via Kubernetes and verify that container_checker reports no error.
  • Loading branch information
renukamanavalan authored and raphaelt-nvidia committed May 13, 2021
1 parent 45a04d9 commit 4e1292e
Showing 1 changed file with 76 additions and 36 deletions.
112 changes: 76 additions & 36 deletions files/image_config/monit/container_checker
Original file line number Diff line number Diff line change
Expand Up @@ -16,50 +16,31 @@ check program container_checker with path "/usr/bin/container_checker"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
"""

import subprocess
import docker
import sys

import swsssdk
from sonic_py_common import multi_asic


def get_command_result(command):
    """
    @summary: Run `command` through the shell and collect what it wrote to stdout.
              If the command exits with a non-zero status, a message is printed and
              the script exits with status 1. If the process cannot be launched
              (OSError / ValueError), a message is printed and the script exits
              with status 2.
    @return: A list of strings: stdout with trailing whitespace stripped,
             split on newline characters.
    """
    output = ""

    try:
        process = subprocess.Popen(
            command,
            shell=True,
            universal_newlines=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # stderr is captured so that it does not leak to the console; it is not used.
        output, _unused_stderr = process.communicate()
        exit_code = process.returncode
        if exit_code != 0:
            print("Failed to execute the command '{}'. Return code: '{}'".format(
                command, exit_code))
            sys.exit(1)
    except (OSError, ValueError) as err:
        print("Failed to execute the command '{}'. Error: '{}'".format(command, err))
        sys.exit(2)

    stripped_output = output.rstrip()
    return stripped_output.split("\n")
from swsscommon import swsscommon


def get_expected_running_containers():
"""
@summary: This function will get the expected running containers by following the rule:
@summary: This function will get the expected running & always-enabled containers by following the rule:
The 'state' field of container in 'FEATURE' table should not be 'disabled' or 'always_disabled'.
If the device has Multi-ASIC, this function will get container list by determining the
value of field 'has_global_scope', the number of ASICs and the value of field
'has_per_asic_scope'.
If the device has single ASIC, the container name was put into the list.
@return: A set which contains the expected running containers.
@return: A set which contains the expected running containers and a set that has
containers marked as "always_enabled".
"""
config_db = swsssdk.ConfigDBConnector()
config_db.connect()
feature_table = config_db.get_table("FEATURE")

expected_running_containers = set()
always_running_containers = set()

for container_name in feature_table.keys():
if feature_table[container_name]["state"] not in ["disabled", "always_disabled"]:
Expand All @@ -70,37 +51,95 @@ def get_expected_running_containers():
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
expected_running_containers.add(container_name + str(asic_id))
elif feature_table[container_name]["state"] == 'always_enabled':
always_running_containers.add(container_name)
else:
expected_running_containers.add(container_name)

return expected_running_containers
return expected_running_containers, always_running_containers


def get_current_running_from_DB(always_running_containers):
    """
    @summary: This function will get the current running container list
              from FEATURE table @ STATE_DB, if this table is available.
              A feature is counted as running when its STATE_DB entry has a
              non-empty 'container_id' field. Each name in
              `always_running_containers` is additionally looked up through the
              Docker API and counted as running when its state status is 'running'.
    @param always_running_containers: Iterable of container names to be checked
              directly against the Docker daemon.
    @return: a tuple
             First: Return value indicating if info can be obtained from
                    DB or not (False when the FEATURE table has no keys).
             Second: A set which contains the current running containers,
                     if this info is available in DB; None otherwise.
    """
    state_db = swsscommon.DBConnector("STATE_DB", 0)
    tbl = swsscommon.Table(state_db, "FEATURE")

    # Read the key list once so that the emptiness check and the iteration
    # below work on the same snapshot of the table.
    feature_names = tbl.getKeys()
    if not feature_names:
        return False, None

    running_containers = set()
    for name in feature_names:
        # tbl.get() returns a (status, field-value pairs) tuple; only the pairs are needed.
        data = dict(tbl.get(name)[1])
        if data.get('container_id'):
            running_containers.add(name)

    docker_client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    for name in always_running_containers:
        try:
            container = docker_client.containers.get(name)
        except (docker.errors.NotFound, docker.errors.APIError) as err:
            # Best effort: a container that cannot be queried is simply not
            # reported as running.
            print("Failed to get container '{}'. Error: '{}'".format(name, err))
            continue

        container_state = container.attrs.get('State', {})
        if container_state.get('Status', "") == 'running':
            running_containers.add(name)

    return True, running_containers


def get_current_running_from_dockers():
    """
    @summary: Ask the Docker daemon (via its unix socket) for every container
              whose status is "running" and collect the container names.
    @return: A set which contains the names of the containers in running state.
             If the Docker API reports an error, the error is printed and the
             names gathered so far (possibly none) are returned.
    """
    names = set()
    client = docker.DockerClient(base_url='unix://var/run/docker.sock')
    collection = client.containers
    try:
        running = collection.list(filters={"status": "running"})
        names.update(ctr.name for ctr in running)
    except docker.errors.APIError as err:
        print("Failed to retrieve the running container list. Error: '{}'".format(err))
    return names


def get_current_running_containers(always_running_containers):
    """
    @summary: Return the currently running containers. The STATE_DB based lookup
              is tried first; when it reports that no information is available,
              the list of running docker containers is used instead.
    @return: A set of currently running containers.
    """
    db_has_info, running_from_db = get_current_running_from_DB(always_running_containers)
    if db_has_info:
        return running_from_db
    return get_current_running_from_dockers()


def main():
"""
@summary: This function will compare the difference between the current running containers
and the containers which were expected to run. If containers which were expected
to run were not running, then an alerting message will be written into syslog.
"""
expected_running_containers = get_expected_running_containers()
current_running_containers = get_current_running_containers()
expected_running_containers, always_running_containers = get_expected_running_containers()
current_running_containers = get_current_running_containers(always_running_containers)

expected_running_containers |= always_running_containers
not_running_containers = expected_running_containers.difference(current_running_containers)
if not_running_containers:
print("Expected containers not running: " + ", ".join(not_running_containers))
Expand All @@ -114,3 +153,4 @@ def main():

if __name__ == "__main__":
    # Run the check only when executed as a script; if main() returns
    # normally, terminate with exit status 0.
    main()
    sys.exit(0)

0 comments on commit 4e1292e

Please sign in to comment.