Skip to content

Commit

Permalink
Add asic presence filtering for container checking in system-health
Browse files Browse the repository at this point in the history
  • Loading branch information
spilkey-cisco committed Jan 24, 2023
1 parent fd3966a commit 3342a31
Showing 1 changed file with 18 additions and 4 deletions.
22 changes: 18 additions & 4 deletions src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,19 @@ def get_expected_running_containers(self, feature_table):
"""
expected_running_containers = set()
container_feature_dict = {}

# Get the current ASIC presence list. On a multi-ASIC system, multi-instance containers
# should be checked only for the ASICs that are present.
asics_id_presence = multi_asic.get_asic_presence_list()

# Some services may run all of their instances irrespective of ASIC presence.
# Add those to the exception list.
# database service: Currently, services depend on all database instances
# being up, irrespective of ASIC presence.
# bgp service: Currently, bgp runs all instances. Once this is fixed to be config-driven,
# it will be removed from the exception list.
run_all_instance_list = ['database', 'bgp']

for feature_name, feature_entry in feature_table.items():
if feature_entry["state"] not in ["disabled", "always_disabled"]:
if multi_asic.is_multi_asic():
Expand All @@ -80,8 +93,9 @@ def get_expected_running_containers(self, feature_table):
if feature_entry["has_per_asic_scope"] == "True":
num_asics = multi_asic.get_num_asics()
for asic_id in range(num_asics):
expected_running_containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
if asic_id in asics_id_presence or feature_name in run_all_instance_list:
expected_running_containers.add(feature_name + str(asic_id))
container_feature_dict[feature_name + str(asic_id)] = feature_name
else:
expected_running_containers.add(feature_name)
container_feature_dict[feature_name] = feature_name
Expand Down Expand Up @@ -343,7 +357,7 @@ def check_process_existence(self, container_name, critical_process_list, config,
process_status = utils.run_command(cmd)
if process_status is None:
for process_name in critical_process_list:
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
self.publish_events(container_name, critical_process_list)
return

Expand All @@ -356,6 +370,6 @@ def check_process_existence(self, container_name, critical_process_list, config,
# and it is safe to ignore such process. E.g, radv. So here we only check those processes which are in process_status.
if process_name in process_status:
if process_status[process_name] != 'RUNNING':
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "Process '{}' in container '{}' is not running".format(process_name, container_name))
else:
self.set_object_ok('Process', '{}:{}'.format(container_name, process_name))

0 comments on commit 3342a31

Please sign in to comment.