From 868961303de4c520c8cb241a7788716b4a793f45 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Thu, 5 Mar 2020 15:27:21 -0800 Subject: [PATCH] [Services] Restart NAT service upon unexpected critical process exit. (#4208) --- dockers/docker-nat/Dockerfile.j2 | 2 ++ dockers/docker-nat/critical_processes | 2 ++ dockers/docker-nat/supervisord.conf | 8 +++++++- files/build_templates/nat.service.j2 | 4 ++++ rules/docker-nat.mk | 3 ++- 5 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 dockers/docker-nat/critical_processes diff --git a/dockers/docker-nat/Dockerfile.j2 b/dockers/docker-nat/Dockerfile.j2 index 3cfbd99e95e1..a74147cc26fd 100644 --- a/dockers/docker-nat/Dockerfile.j2 +++ b/dockers/docker-nat/Dockerfile.j2 @@ -38,6 +38,8 @@ RUN apt-get update \ COPY ["start.sh", "/usr/bin/"] COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] COPY ["restore_nat_entries.py", "/usr/bin/"] +COPY ["files/supervisor-proc-exit-listener", "/usr/bin"] +COPY ["critical_processes", "/etc/supervisor"] RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y RUN rm -rf /debs diff --git a/dockers/docker-nat/critical_processes b/dockers/docker-nat/critical_processes new file mode 100644 index 000000000000..d442976143f1 --- /dev/null +++ b/dockers/docker-nat/critical_processes @@ -0,0 +1,2 @@ +natmgrd +natsyncd diff --git a/dockers/docker-nat/supervisord.conf b/dockers/docker-nat/supervisord.conf index bb42d23fe355..839d6f59ab3c 100644 --- a/dockers/docker-nat/supervisord.conf +++ b/dockers/docker-nat/supervisord.conf @@ -3,6 +3,12 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener --container-name nat +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + [program:start.sh] command=/usr/bin/start.sh priority=1 @@ -15,7 +21,7 @@ stderr_logfile=syslog command=/usr/sbin/rsyslogd -n priority=2 autostart=false -autorestart=false +autorestart=unexpected stdout_logfile=syslog stderr_logfile=syslog diff --git a/files/build_templates/nat.service.j2 b/files/build_templates/nat.service.j2 index 2e3e17439ef7..79a56f67ca89 100644 --- a/files/build_templates/nat.service.j2 +++ b/files/build_templates/nat.service.j2 @@ -3,12 +3,16 @@ Description=NAT container Requires=updategraph.service swss.service After=updategraph.service swss.service syncd.service Before=ntp-config.service +StartLimitIntervalSec=1200 +StartLimitBurst=3 [Service] User={{ sonicadmin_user }} ExecStartPre=/usr/bin/{{docker_container_name}}.sh start ExecStart=/usr/bin/{{docker_container_name}}.sh wait ExecStop=/usr/bin/{{docker_container_name}}.sh stop +Restart=always +RestartSec=30 [Install] WantedBy=multi-user.target swss.service diff --git a/rules/docker-nat.mk b/rules/docker-nat.mk index dcccc24ba2ff..eb6bd16ccd46 100644 --- a/rules/docker-nat.mk +++ b/rules/docker-nat.mk @@ -30,5 +30,6 @@ $(DOCKER_NAT)_RUN_OPT += --privileged -t $(DOCKER_NAT)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_NAT)_RUN_OPT += -v /host/warmboot:/var/warmboot -$(DOCKER_NAT)_BASE_IMAGE_FILES += natctl:/usr/bin/natctl +$(DOCKER_NAT)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) +$(DOCKER_NAT)_BASE_IMAGE_FILES += natctl:/usr/bin/natctl