From 4fdf975f91193ea0897352af1bfdfb4bd0b6f244 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Fri, 18 Sep 2020 13:19:05 +0800 Subject: [PATCH] [thermalctld] Fix issue: thermalctld should be auto restarted when being killed (#94) Part of thermalctld's job is to handle user-space thermal policies for events such as fan/PSU removal; it works together with the kernel thermal algorithm to make sure the switch does not overheat. Recently, we found that commit Azure/sonic-buildimage@cbc75fe changed its autorestart configuration in supervisord, so it is no longer automatically restarted after being killed. This PR makes sure that thermalctld is always restarted when it is killed. --- sonic-thermalctld/scripts/thermalctld | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 0e2797f89d29..d305dd101cfa 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -622,6 +622,11 @@ class ThermalControlDaemon(daemon_base.DaemonBase): super(ThermalControlDaemon, self).__init__(log_identifier) self.stop_event = threading.Event() + # Thermal control daemon is designed to never exit, it must always + # return non zero exit code when exiting and so that supervisord will + # restart it automatically. 
+ self.exit_code = 1 + # Signal handler def signal_handler(self, sig, frame): """ @@ -632,11 +637,9 @@ class ThermalControlDaemon(daemon_base.DaemonBase): """ if sig == signal.SIGHUP: self.log_info("Caught SIGHUP - ignoring...") - elif sig == signal.SIGINT: - self.log_info("Caught SIGINT - exiting...") - self.stop_event.set() - elif sig == signal.SIGTERM: - self.log_info("Caught SIGTERM - exiting...") + elif sig == signal.SIGINT or sig == signal.SIGTERM: + self.log_info("Caught signal {} - exiting...".format(sig)) + self.exit_code = sig + 128 self.stop_event.set() else: self.log_warning("Caught unhandled signal '" + sig + "'") @@ -690,7 +693,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase): thermal_monitor.task_stop() - self.log_info("Shutdown...") + self.log_info("Shutdown with exit code {}...".format(self.exit_code)) + exit(self.exit_code) #