From 579ca2b9fa096d9b11077d31d29eb889416417bf Mon Sep 17 00:00:00 2001 From: anamehra <54692434+anamehra@users.noreply.github.com> Date: Wed, 13 Jul 2022 09:42:23 -0700 Subject: [PATCH 1/5] container_checker on RP should check containers based on asic presence On Supervisor/RP card, some application containers may not run if the asic is not present due to a missing Fabric card. The container checker should skip those container instances. Container instances which run only if asic present: swss, syncd, lldp, teamd Exception: All instances of database and bgp containers run irrespective of asic presence. Signed-off-by: anamehra anamehra@cisco.com --- files/image_config/monit/container_checker | 44 ++++++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/files/image_config/monit/container_checker b/files/image_config/monit/container_checker index a67a96a0c18c..4c5363f966b0 100755 --- a/files/image_config/monit/container_checker +++ b/files/image_config/monit/container_checker @@ -20,9 +20,34 @@ import docker import sys import swsssdk -from sonic_py_common import multi_asic, device_info +from sonic_py_common import multi_asic, device_info, daemon_base from swsscommon import swsscommon +def get_asic_presence_list(): + """ + @summary: This function will get the asic presence list. On Supervisor, the list includes only the asics + for inserted and detected fabric cards. For non-supervisor cards, e.g. line card, the list should + contain all supported asics by the card. The function gets the asic list from CHASSIS_ASIC_TABLE from + CHASSIS_STATE_DB. The function assumes that the first N asic ids (asic0 to asic(N-1)) in + CHASSIS_ASIC_TABLE belongs to the supervisor, where N is the max number of asics supported by the Chassis + @return: List of asics present + """ + asics_list = [] + if multi_asic.is_multi_asic(): + if not device_info.is_supervisor(): + # Supervisor has FRU Fabric cards. If not supervisor, all asics + # should be present. 
Add all asics, 0 - num_asics to the list. + asics_list = list(range(0,multi_asic.get_num_asics())) + else: + # Get asic list from CHASSIS_ASIC_TABLE + chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") + asic_table = swsscommon.Table(chassis_state_db, 'CHASSIS_ASIC_TABLE') + if asic_table: + asics_presence_list = list(asic_table.getKeys()) + for asic in asics_presence_list: + # asic is asid id: asic0, asic1.... asicN. Get the numeric value. + asics_list.append(int(asic[4:])) + return asics_list def get_expected_running_containers(): """ @@ -41,7 +66,15 @@ def get_expected_running_containers(): expected_running_containers = set() always_running_containers = set() - + + # Get current asic presence list. For multi_asic system, multi instance containers + # should be checked only for asics present. + asics_id_presence = get_asic_presence_list() + + # Some services, like database and bgp run all the instances irrespective of asic presence. + # Add those to exception list. + run_all_instance_list = ['database', 'bgp'] + for container_name in feature_table.keys(): if feature_table[container_name]["state"] not in ["disabled", "always_disabled"]: if multi_asic.is_multi_asic(): @@ -50,7 +83,8 @@ def get_expected_running_containers(): if feature_table[container_name]["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - expected_running_containers.add(container_name + str(asic_id)) + if asic_id in asics_id_presence or container_name in run_all_instance_list: + expected_running_containers.add(container_name + str(asic_id)) else: expected_running_containers.add(container_name) if feature_table[container_name]["state"] == 'always_enabled': @@ -60,9 +94,11 @@ def get_expected_running_containers(): if feature_table[container_name]["has_per_asic_scope"] == "True": num_asics = multi_asic.get_num_asics() for asic_id in range(num_asics): - always_running_containers.add(container_name + str(asic_id)) + if asic_id in 
asics_id_presence or container_name in run_all_instance_list: + always_running_containers.add(container_name + str(asic_id)) else: always_running_containers.add(container_name) + if device_info.is_supervisor(): always_running_containers.add("database-chassis") return expected_running_containers, always_running_containers From 3c34c1487796b0bd27a9c40a1f81a759d17ce99e Mon Sep 17 00:00:00 2001 From: anamehra Date: Tue, 26 Jul 2022 14:59:03 -0700 Subject: [PATCH 2/5] Moved function to multi_asic.py Added more comments. Signed-off-by: anamehra --- files/image_config/monit/container_checker | 36 ++++--------------- .../sonic_py_common/multi_asic.py | 32 ++++++++++++++++- 2 files changed, 38 insertions(+), 30 deletions(-) diff --git a/files/image_config/monit/container_checker b/files/image_config/monit/container_checker index 4c5363f966b0..c6271d26c8b1 100755 --- a/files/image_config/monit/container_checker +++ b/files/image_config/monit/container_checker @@ -20,35 +20,9 @@ import docker import sys import swsssdk -from sonic_py_common import multi_asic, device_info, daemon_base +from sonic_py_common import multi_asic, device_info from swsscommon import swsscommon -def get_asic_presence_list(): - """ - @summary: This function will get the asic presence list. On Supervisor, the list includes only the asics - for inserted and detected fabric cards. For non-supervisor cards, e.g. line card, the list should - contain all supported asics by the card. The function gets the asic list from CHASSIS_ASIC_TABLE from - CHASSIS_STATE_DB. The function assumes that the first N asic ids (asic0 to asic(N-1)) in - CHASSIS_ASIC_TABLE belongs to the supervisor, where N is the max number of asics supported by the Chassis - @return: List of asics present - """ - asics_list = [] - if multi_asic.is_multi_asic(): - if not device_info.is_supervisor(): - # Supervisor has FRU Fabric cards. If not supervisor, all asics - # should be present. Add all asics, 0 - num_asics to the list. 
- asics_list = list(range(0,multi_asic.get_num_asics())) - else: - # Get asic list from CHASSIS_ASIC_TABLE - chassis_state_db = daemon_base.db_connect("CHASSIS_STATE_DB") - asic_table = swsscommon.Table(chassis_state_db, 'CHASSIS_ASIC_TABLE') - if asic_table: - asics_presence_list = list(asic_table.getKeys()) - for asic in asics_presence_list: - # asic is asid id: asic0, asic1.... asicN. Get the numeric value. - asics_list.append(int(asic[4:])) - return asics_list - def get_expected_running_containers(): """ @summary: This function will get the expected running & always-enabled containers by following the rule: @@ -69,10 +43,14 @@ def get_expected_running_containers(): # Get current asic presence list. For multi_asic system, multi instance containers # should be checked only for asics present. - asics_id_presence = get_asic_presence_list() + asics_id_presence = multi_asic.get_asic_presence_list() - # Some services, like database and bgp run all the instances irrespective of asic presence. + # Some services may run all the instances irrespective of asic presence. # Add those to exception list. + # database service: Currently services have dependency on all database services to + # be up irrespective of asic presence. + # bgp service: Currently bgp runs all instances. Once this is fixed to be config driven, + # it will be removed from exception list. 
run_all_instance_list = ['database', 'bgp'] for container_name in feature_table.keys(): diff --git a/src/sonic-py-common/sonic_py_common/multi_asic.py b/src/sonic-py-common/sonic_py_common/multi_asic.py index 662c01800e35..286cd3ec52e2 100644 --- a/src/sonic-py-common/sonic_py_common/multi_asic.py +++ b/src/sonic-py-common/sonic_py_common/multi_asic.py @@ -4,6 +4,7 @@ from natsort import natsorted from swsscommon import swsscommon +from sonic_py_common import daemon_base from .device_info import CONTAINER_PLATFORM_PATH from .device_info import HOST_DEVICE_PATH @@ -25,7 +26,8 @@ NEIGH_DEVICE_METADATA_CFG_DB_TABLE = 'DEVICE_NEIGHBOR_METADATA' DEFAULT_NAMESPACE = '' PORT_ROLE = 'role' - +CHASSIS_STATE_DB='CHASSIS_STATE_DB' +CHASSIS_ASIC_INFO_TABLE='CHASSIS_ASIC_TABLE' # Dictionary to cache config_db connection handle per namespace # to prevent duplicate connections from being opened @@ -480,3 +482,31 @@ def validate_namespace(namespace): return True else: return False + +def get_asic_presence_list(): + """ + @summary: This function will get the asic presence list. On Supervisor, the list includes only the asics + for inserted and detected fabric cards. For non-supervisor cards, e.g. line card, the list should + contain all supported asics by the card. The function gets the asic list from CHASSIS_ASIC_TABLE from + CHASSIS_STATE_DB. The function assumes that the first N asic ids (asic0 to asic(N-1)) in + CHASSIS_ASIC_TABLE belongs to the supervisor, where N is the max number of asics supported by the Chassis + @return: List of asics present + """ + asics_list = [] + if is_multi_asic(): + if not is_supervisor(): + # This is not supervisor, all asics should be present. Assuming that asics + # are not removable entity on Line Cards. Add all asics, 0 - num_asics to the list. + asics_list = list(range(0,get_num_asics())) + else: + # This is supervisor card. Some fabric cards may not be inserted. 
+ # Get asic list from CHASSIS_ASIC_TABLE which lists only the asics + # present based on Fabric card detection by the platform. + db = daemon_base.db_connect(CHASSIS_STATE_DB) + asic_table = swsscommon.Table(db,CHASSIS_ASIC_INFO_TABLE) + if asic_table: + asics_presence_list = list(asic_table.getKeys()) + for asic in asics_presence_list: + # asic is asid id: asic0, asic1.... asicN. Get the numeric value. + asics_list.append(int(asic[4:])) + return asics_list From feb862801102567ed5996169c39be9fd6399ee57 Mon Sep 17 00:00:00 2001 From: anamehra Date: Fri, 29 Jul 2022 11:10:58 -0700 Subject: [PATCH 3/5] use get_asic_id_from_name() Signed-off-by: anamehra --- src/sonic-py-common/sonic_py_common/multi_asic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sonic-py-common/sonic_py_common/multi_asic.py b/src/sonic-py-common/sonic_py_common/multi_asic.py index 286cd3ec52e2..cee529a5511e 100644 --- a/src/sonic-py-common/sonic_py_common/multi_asic.py +++ b/src/sonic-py-common/sonic_py_common/multi_asic.py @@ -508,5 +508,5 @@ def get_asic_presence_list(): asics_presence_list = list(asic_table.getKeys()) for asic in asics_presence_list: # asic is asid id: asic0, asic1.... asicN. Get the numeric value. 
- asics_list.append(int(asic[4:])) + asics_list.append(int(get_asic_id_from_name(asic))) return asics_list From bd30ff283ac7161cb078e9c4fecde40be07ef561 Mon Sep 17 00:00:00 2001 From: anamehra <54692434+anamehra@users.noreply.github.com> Date: Wed, 10 Aug 2022 22:38:00 -0700 Subject: [PATCH 4/5] Use swsscommon.DBConnector --- src/sonic-py-common/sonic_py_common/multi_asic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sonic-py-common/sonic_py_common/multi_asic.py b/src/sonic-py-common/sonic_py_common/multi_asic.py index cee529a5511e..77c759c631ee 100644 --- a/src/sonic-py-common/sonic_py_common/multi_asic.py +++ b/src/sonic-py-common/sonic_py_common/multi_asic.py @@ -497,13 +497,13 @@ def get_asic_presence_list(): if not is_supervisor(): # This is not supervisor, all asics should be present. Assuming that asics # are not removable entity on Line Cards. Add all asics, 0 - num_asics to the list. - asics_list = list(range(0,get_num_asics())) + asics_list = list(range(0, get_num_asics())) else: # This is supervisor card. Some fabric cards may not be inserted. # Get asic list from CHASSIS_ASIC_TABLE which lists only the asics # present based on Fabric card detection by the platform. 
- db = daemon_base.db_connect(CHASSIS_STATE_DB) - asic_table = swsscommon.Table(db,CHASSIS_ASIC_INFO_TABLE) + db = swsscommon.DBConnector(CHASSIS_STATE_DB, 0, True) + asic_table = swsscommon.Table(db, CHASSIS_ASIC_INFO_TABLE) if asic_table: asics_presence_list = list(asic_table.getKeys()) for asic in asics_presence_list: From cf51950ce6635d1d88527e9e68e7f3370865cb4b Mon Sep 17 00:00:00 2001 From: anamehra <54692434+anamehra@users.noreply.github.com> Date: Thu, 11 Aug 2022 09:40:45 -0700 Subject: [PATCH 5/5] Update multi_asic.py Removed daemon_base import --- src/sonic-py-common/sonic_py_common/multi_asic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sonic-py-common/sonic_py_common/multi_asic.py b/src/sonic-py-common/sonic_py_common/multi_asic.py index 77c759c631ee..dc16a88591f5 100644 --- a/src/sonic-py-common/sonic_py_common/multi_asic.py +++ b/src/sonic-py-common/sonic_py_common/multi_asic.py @@ -4,7 +4,6 @@ from natsort import natsorted from swsscommon import swsscommon -from sonic_py_common import daemon_base from .device_info import CONTAINER_PLATFORM_PATH from .device_info import HOST_DEVICE_PATH