From e6938e943f91f5bc9e9b57c767276f404724176f Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 15:30:43 -0800 Subject: [PATCH 1/7] [database] Implement the auto-restart feature for database container. Signed-off-by: Yong Zhao --- dockers/docker-database/Dockerfile.j2 | 2 ++ dockers/docker-database/critical_processes | 1 + dockers/docker-database/supervisord.conf.j2 | 7 ++++ files/build_templates/dhcp_relay.service.j2 | 2 +- files/build_templates/lldp.service.j2 | 2 +- files/build_templates/radv.service.j2 | 2 +- files/build_templates/sflow.service.j2 | 2 +- .../single_instance/bgp.service.j2 | 2 +- .../single_instance/database.service.j2 | 4 +++ .../single_instance/teamd.service.j2 | 2 +- files/build_templates/snmp.service.j2 | 2 +- files/scripts/supervisor-proc-exit-listener | 33 ++++++++++--------- rules/docker-database.mk | 1 + 13 files changed, 39 insertions(+), 23 deletions(-) create mode 100644 dockers/docker-database/critical_processes diff --git a/dockers/docker-database/Dockerfile.j2 b/dockers/docker-database/Dockerfile.j2 index acb5e013fb84..8cd181614672 100644 --- a/dockers/docker-database/Dockerfile.j2 +++ b/dockers/docker-database/Dockerfile.j2 @@ -36,5 +36,7 @@ COPY ["supervisord.conf.j2", "/usr/share/sonic/templates/"] COPY ["docker-database-init.sh", "/usr/local/bin/"] COPY ["ping_pong_db_insts", "/usr/local/bin/"] COPY ["database_config.json", "/etc/default/sonic-db/"] +COPY ["files/supervisor-proc-exit-listener", "/usr/bin"] +COPY ["critical_processes", "/etc/supervisor"] ENTRYPOINT ["/usr/local/bin/docker-database-init.sh"] diff --git a/dockers/docker-database/critical_processes b/dockers/docker-database/critical_processes new file mode 100644 index 000000000000..7800f0fad3ff --- /dev/null +++ b/dockers/docker-database/critical_processes @@ -0,0 +1 @@ +redis diff --git a/dockers/docker-database/supervisord.conf.j2 b/dockers/docker-database/supervisord.conf.j2 index 110619f762be..442bec1438c8 100644 --- 
a/dockers/docker-database/supervisord.conf.j2 +++ b/dockers/docker-database/supervisord.conf.j2 @@ -3,6 +3,13 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name database +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + + [program:rsyslogd] command=/bin/bash -c "rm -f /var/run/rsyslogd.pid && /usr/sbin/rsyslogd -n" priority=1 diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2 index d501a663feba..ff4c0e63de5c 100644 --- a/files/build_templates/dhcp_relay.service.j2 +++ b/files/build_templates/dhcp_relay.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=DHCP relay container -Requires=updategraph.service +Requires=updategraph.service database.service After=updategraph.service swss.service syncd.service teamd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/lldp.service.j2 b/files/build_templates/lldp.service.j2 index 2599fc5c5bdc..f6f3e7cdfb30 100644 --- a/files/build_templates/lldp.service.j2 +++ b/files/build_templates/lldp.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=LLDP container -Requires=updategraph.service +Requires=updategraph.service database.service After=updategraph.service swss.service syncd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/radv.service.j2 b/files/build_templates/radv.service.j2 index b3dd3a8d8bcb..5a9047fcc5a2 100644 --- a/files/build_templates/radv.service.j2 +++ b/files/build_templates/radv.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=Router advertiser container -Requires=updategraph.service +Requires=updategraph.service database.service After=updategraph.service swss.service syncd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/sflow.service.j2 b/files/build_templates/sflow.service.j2 index 
643bf646964d..eea0515e978f 100644 --- a/files/build_templates/sflow.service.j2 +++ b/files/build_templates/sflow.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=sFlow container -Requisite=swss.service +Requisite=swss.service database.service After=swss.service syncd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/single_instance/bgp.service.j2 b/files/build_templates/single_instance/bgp.service.j2 index 7200a0e3ecf2..b766e165092a 100644 --- a/files/build_templates/single_instance/bgp.service.j2 +++ b/files/build_templates/single_instance/bgp.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=BGP container -Requires=updategraph.service +Requires=updategraph.service database.service After=updategraph.service Before=ntp-config.service diff --git a/files/build_templates/single_instance/database.service.j2 b/files/build_templates/single_instance/database.service.j2 index 472b9d328b7d..fd0063195e31 100644 --- a/files/build_templates/single_instance/database.service.j2 +++ b/files/build_templates/single_instance/database.service.j2 @@ -3,12 +3,16 @@ Description=Database container Requires=docker.service After=docker.service After=rc-local.service +StartLimitIntervalSec=1200 +StartLimitBurst=3 [Service] User=root ExecStartPre=/usr/bin/{{docker_container_name}}.sh start ExecStart=/usr/bin/{{docker_container_name}}.sh wait ExecStop=/usr/bin/{{docker_container_name}}.sh stop +Restart=always +RestartSec=30 [Install] WantedBy=multi-user.target diff --git a/files/build_templates/single_instance/teamd.service.j2 b/files/build_templates/single_instance/teamd.service.j2 index be0521a4fbec..7f01dbe7e032 100644 --- a/files/build_templates/single_instance/teamd.service.j2 +++ b/files/build_templates/single_instance/teamd.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=TEAMD container -Requires=updategraph.service +Requires=updategraph.service database.service After=updategraph.service swss.service Before=ntp-config.service 
StartLimitIntervalSec=1200 diff --git a/files/build_templates/snmp.service.j2 b/files/build_templates/snmp.service.j2 index 4997ab737e37..44072c55b956 100644 --- a/files/build_templates/snmp.service.j2 +++ b/files/build_templates/snmp.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=SNMP container -Requires=updategraph.service +Requires=updategraph.service database.service Requisite=swss.service After=updategraph.service swss.service syncd.service Before=ntp-config.service diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index cf26d5383074..9485540158ed 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -52,24 +52,25 @@ def main(argv): processname = payload_headers['processname'] groupname = payload_headers['groupname'] - config_db = swsssdk.ConfigDBConnector() - config_db.connect() - container_features_table = config_db.get_table(CONTAINER_FEATURE_TABLE_NAME) - if not container_features_table: - syslog.syslog(syslog.LOG_ERR, "Unable to retrieve container features table from Config DB. Exiting...") - sys.exit(2) - - if not container_features_table.has_key(container_name): - syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features for container '{}'. Exiting...".format(container_name)) - sys.exit(3) - - restart_feature = container_features_table[container_name].get('auto_restart') - if not restart_feature: - syslog.syslog(syslog.LOG_ERR, "Unable to determine auto-restart feature status for container '{}'. Exiting...".format(container_name)) - sys.exit(4) + if container_name != 'database': + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + container_features_table = config_db.get_table(CONTAINER_FEATURE_TABLE_NAME) + if not container_features_table: + syslog.syslog(syslog.LOG_ERR, "Unable to retrieve container features table from Config DB. 
Exiting...") + sys.exit(2) + + if not container_features_table.has_key(container_name): + syslog.syslog(syslog.LOG_ERR, "Unable to retrieve features for container '{}'. Exiting...".format(container_name)) + sys.exit(3) + + restart_feature = container_features_table[container_name].get('auto_restart') + if not restart_feature: + syslog.syslog(syslog.LOG_ERR, "Unable to determine auto-restart feature status for container '{}'. Exiting...".format(container_name)) + sys.exit(4) # If auto-restart feature is enabled and a critical process exited unexpectedly, terminate supervisor - if restart_feature == 'enabled' and expected == 0 and (processname in critical_processes or groupname in critical_processes): + if (container_name == 'database' or restart_feature == 'enabled') and expected == 0 and (processname in critical_processes or groupname in critical_processes): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg) diff --git a/rules/docker-database.mk b/rules/docker-database.mk index 91fd06819a4b..7e372048afab 100644 --- a/rules/docker-database.mk +++ b/rules/docker-database.mk @@ -28,3 +28,4 @@ $(DOCKER_DATABASE)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_DATABASE)_BASE_IMAGE_FILES += redis-cli:/usr/bin/redis-cli $(DOCKER_DATABASE)_BASE_IMAGE_FILES += monit_database:/etc/monit/conf.d +$(DOCKER_DATABASE)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) From 6750cacea9c69dcfb8253b9a229ae1913f8bbb2c Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 16:41:14 -0800 Subject: [PATCH 2/7] [database] Remove the duplicate dependency in service files. Since we already have updategraph ---> config_setup ---> database, we do not need to explicitly add database.service in all other container service files. 
Signed-off-by: Yong Zhao --- files/build_templates/dhcp_relay.service.j2 | 2 +- files/build_templates/lldp.service.j2 | 2 +- files/build_templates/radv.service.j2 | 2 +- files/build_templates/single_instance/bgp.service.j2 | 2 +- files/build_templates/single_instance/teamd.service.j2 | 2 +- files/build_templates/snmp.service.j2 | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2 index ff4c0e63de5c..d501a663feba 100644 --- a/files/build_templates/dhcp_relay.service.j2 +++ b/files/build_templates/dhcp_relay.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=DHCP relay container -Requires=updategraph.service database.service +Requires=updategraph.service After=updategraph.service swss.service syncd.service teamd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/lldp.service.j2 b/files/build_templates/lldp.service.j2 index f6f3e7cdfb30..2599fc5c5bdc 100644 --- a/files/build_templates/lldp.service.j2 +++ b/files/build_templates/lldp.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=LLDP container -Requires=updategraph.service database.service +Requires=updategraph.service After=updategraph.service swss.service syncd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/radv.service.j2 b/files/build_templates/radv.service.j2 index 5a9047fcc5a2..b3dd3a8d8bcb 100644 --- a/files/build_templates/radv.service.j2 +++ b/files/build_templates/radv.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=Router advertiser container -Requires=updategraph.service database.service +Requires=updategraph.service After=updategraph.service swss.service syncd.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/single_instance/bgp.service.j2 b/files/build_templates/single_instance/bgp.service.j2 index b766e165092a..7200a0e3ecf2 100644 --- 
a/files/build_templates/single_instance/bgp.service.j2 +++ b/files/build_templates/single_instance/bgp.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=BGP container -Requires=updategraph.service database.service +Requires=updategraph.service After=updategraph.service Before=ntp-config.service diff --git a/files/build_templates/single_instance/teamd.service.j2 b/files/build_templates/single_instance/teamd.service.j2 index 7f01dbe7e032..be0521a4fbec 100644 --- a/files/build_templates/single_instance/teamd.service.j2 +++ b/files/build_templates/single_instance/teamd.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=TEAMD container -Requires=updategraph.service database.service +Requires=updategraph.service After=updategraph.service swss.service Before=ntp-config.service StartLimitIntervalSec=1200 diff --git a/files/build_templates/snmp.service.j2 b/files/build_templates/snmp.service.j2 index 44072c55b956..4997ab737e37 100644 --- a/files/build_templates/snmp.service.j2 +++ b/files/build_templates/snmp.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=SNMP container -Requires=updategraph.service database.service +Requires=updategraph.service Requisite=swss.service After=updategraph.service swss.service syncd.service Before=ntp-config.service From 21f5abeefa4edae9a93d59e1782916b151bbf70a Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 16:44:37 -0800 Subject: [PATCH 3/7] [event listener] Reorganize the line 73 in event listener script. 
Signed-off-by: Yong Zhao --- files/scripts/supervisor-proc-exit-listener | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 9485540158ed..86b54ed823b2 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -70,7 +70,8 @@ def main(argv): sys.exit(4) # If auto-restart feature is enabled and a critical process exited unexpectedly, terminate supervisor - if (container_name == 'database' or restart_feature == 'enabled') and expected == 0 and (processname in critical_processes or groupname in critical_processes): + if (container_name == 'database' or restart_feature == 'enabled') and expected == 0 and + (processname in critical_processes or groupname in critical_processes): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg) From c67ab5117be05e2bd0cdea69e6a30b77fe29ddc4 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 16:46:25 -0800 Subject: [PATCH 4/7] [database] update the file sflow.service.j2 to remove the duplicate dependency. Signed-off-by: Yong Zhao --- files/build_templates/sflow.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/build_templates/sflow.service.j2 b/files/build_templates/sflow.service.j2 index eea0515e978f..643bf646964d 100644 --- a/files/build_templates/sflow.service.j2 +++ b/files/build_templates/sflow.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=sFlow container -Requisite=swss.service database.service +Requisite=swss.service After=swss.service syncd.service Before=ntp-config.service StartLimitIntervalSec=1200 From ee57d12814fe4f3494548e520f7895c638407f28 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 17:03:16 -0800 Subject: [PATCH 5/7] [event listener] Add comments in event listener. 
Signed-off-by: Yong Zhao --- files/scripts/supervisor-proc-exit-listener | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 86b54ed823b2..7a5275269849 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -52,6 +52,8 @@ def main(argv): processname = payload_headers['processname'] groupname = payload_headers['groupname'] + # If container is database, we do not need read the status of auto-restart feature from Config_DB. + # Otherwise, we will fetch the status. if container_name != 'database': config_db = swsssdk.ConfigDBConnector() config_db.connect() @@ -69,7 +71,8 @@ def main(argv): syslog.syslog(syslog.LOG_ERR, "Unable to determine auto-restart feature status for container '{}'. Exiting...".format(container_name)) sys.exit(4) - # If auto-restart feature is enabled and a critical process exited unexpectedly, terminate supervisor + # If container is database or auto-restart feature is enabled and at the same time + # a critical process exited unexpectedly, terminate supervisor if (container_name == 'database' or restart_feature == 'enabled') and expected == 0 and (processname in critical_processes or groupname in critical_processes): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." From 6f70ceef26add407c307a606bbadcb11c7a94312 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 17:12:31 -0800 Subject: [PATCH 6/7] [event listener] Update the comments in line 56. 
Signed-off-by: Yong Zhao --- files/scripts/supervisor-proc-exit-listener | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 7a5275269849..ee5a1f3868a5 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -52,8 +52,7 @@ def main(argv): processname = payload_headers['processname'] groupname = payload_headers['groupname'] - # If container is database, we do not need read the status of auto-restart feature from Config_DB. - # Otherwise, we will fetch the status. + # Read the status of auto-restart feature from Config_DB. if container_name != 'database': config_db = swsssdk.ConfigDBConnector() config_db.connect() From 178f3853e5cd4269f5753b163fdf0746b2aae3b0 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Mon, 10 Feb 2020 23:17:18 -0800 Subject: [PATCH 7/7] [event listener] Add parentheses for if statement in line 76 in event listener. Signed-off-by: Yong Zhao --- files/scripts/supervisor-proc-exit-listener | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index ee5a1f3868a5..cf154b3a5c10 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -72,8 +72,8 @@ def main(argv): # If container is database or auto-restart feature is enabled and at the same time # a critical process exited unexpectedly, terminate supervisor - if (container_name == 'database' or restart_feature == 'enabled') and expected == 0 and - (processname in critical_processes or groupname in critical_processes): + if ((container_name == 'database' or restart_feature == 'enabled') and expected == 0 and + (processname in critical_processes or groupname in critical_processes)): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." 
msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg)