diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e032b9..4773939 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.7.0] - 2024-02-26 + +- Client state is now retained on a graceful shutdown (#66). + ## [0.6.0] - 2023-11-30 - Added badges to README.md (#62). diff --git a/README.md b/README.md index e6f7e0c..0f29a48 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ COS Alerter is intended to be used together with alertmanager and prometheus: - Liveness of COS Alerter itself from a metric endpoint it exposes and prometheus scrapes ## Configuring Alertmanager -In order to integrate with COS Alerter you need to a heartbeat rule to Prometheus and Add a route to the Alertmanager config +In order to integrate with COS Alerter you need to add a heartbeat rule to Prometheus and add a route to the Alertmanager config. If you are using the Canonical Observability Stack, the alert rule is already created for you. If not, you can use a rule similar to the following: ```yaml diff --git a/cos_alerter/alerter.py b/cos_alerter/alerter.py index 401380f..4b90b22 100644 --- a/cos_alerter/alerter.py +++ b/cos_alerter/alerter.py @@ -4,6 +4,7 @@ """Main logic for COS Alerter.""" import datetime +import json import logging import os import sys @@ -15,6 +16,7 @@ import apprise import durationpy +import xdg_base_dirs from ruamel.yaml import YAML from ruamel.yaml.constructor import DuplicateKeyError @@ -69,6 +71,13 @@ def reload(self): self.data["notify"]["repeat_interval"] ).total_seconds() + # Static variables. We define them here so it is easy to expose them later as config + # values if needed. + base_dir = xdg_base_dirs.xdg_state_home() / "cos_alerter" + if not base_dir.exists(): + base_dir.mkdir(parents=True) + self.data["clients_file"] = base_dir / "clients.state" + def deep_update(base: dict, new: typing.Optional[dict]): """Deep dict update. 
# This is difficult to test in unit tests because it acquires and does not release all of the
# locks. When integration tests have been solved we need to remove the "no cover" from this
# method.
@staticmethod
def dump_and_pause():  # pragma: no cover
    """Dump the per-client state to disk and pause the program.

    Every client lock is acquired and never released, so anything that needs a client lock
    blocks from this point on. The ``alert_time`` and ``notify_time`` of each client are then
    written as JSON to ``config["clients_file"]``.

    This function does not terminate the process itself; the caller is expected to exit
    afterwards.
    """
    logger.info("Starting safe shutdown.")
    clients = state["clients"]
    # Acquire every client lock. They are intentionally never released.
    # NOTE(review): this is called from the SIGTERM handler. If the interrupted thread already
    # holds one of these locks and the locks are not reentrant, acquire() would block forever
    # -- confirm against the lock type and the main loop.
    for data in clients.values():
        data["lock"].acquire()
    # Locks are not json serializable, so only the timestamps are copied.
    clients_without_locks = {
        clientid: {
            "alert_time": data["alert_time"],
            "notify_time": data["notify_time"],
        }
        for clientid, data in clients.items()
    }
    # Write to a sibling temporary file and rename it into place, so that an interrupted
    # shutdown can never leave a truncated state file for the next startup to parse.
    clients_file = config["clients_file"]
    tmp_file = clients_file.with_name(clients_file.name + ".tmp")
    with tmp_file.open("w") as f:
        json.dump(clients_without_locks, f)
    tmp_file.replace(clients_file)
def sigterm(_, __):  # pragma: no cover
    """Signal handler for graceful shutdown on SIGTERM.

    Logs the shutdown, calls ``AlerterState.dump_and_pause()`` and then exits via
    ``sys.exit()``. Both arguments supplied to the handler are ignored.
    """
    logger.info("Shutting down.")
    # The state has to be dumped before exiting; the dump is what the next start recovers.
    AlerterState.dump_and_pause()
    sys.exit()
D102, D103: Ignore missing docstrings in tests per-file-ignores = {"tests/*" = ["D100","D101","D102","D103"]} -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "google" [tool.pyright] diff --git a/rockcraft.yaml b/rockcraft.yaml index fafe2fc..f0a3682 100644 --- a/rockcraft.yaml +++ b/rockcraft.yaml @@ -1,7 +1,7 @@ name: cos-alerter summary: A liveness checker for self-monitoring. description: Receive regular pings from the cos stack and alert when they stop. -version: "0.6.0" # NOTE: Make sure this matches `cos-alerter` below +version: "0.7.0" # NOTE: Make sure this matches `cos-alerter` below base: ubuntu:22.04 license: Apache-2.0 platforms: @@ -11,7 +11,7 @@ parts: plugin: python source: . python-packages: - - cos-alerter==0.6.0 # NOTE: Make sure this matches `version` above + - cos-alerter==0.7.0 # NOTE: Make sure this matches `version` above stage-packages: - python3-venv services: diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index eb27652..d1c8c24 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -1,5 +1,5 @@ name: cos-alerter -version: '0.6.0' +version: '0.7.0' summary: A watchdog alerting on alertmanager notification failures. license: Apache-2.0 contact: simon.aronsson@canonical.com diff --git a/tests/test_alerter.py b/tests/test_alerter.py index d52c29a..e446672 100644 --- a/tests/test_alerter.py +++ b/tests/test_alerter.py @@ -164,6 +164,30 @@ def test_is_down_with_wait_for_first_connection(monotonic_mock, fake_fs): assert state.is_down() is True # 5.5 minutes since reset_alert_timeout() was called. 
@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down_from_graceful_shutdown(monotonic_mock, fake_fs):
    """Check is_down() for a client whose state was restored from a dumped state file."""
    with open("/etc/cos-alerter.yaml") as f:
        conf = yaml.safe_load(f)
    conf["watch"]["wait_for_first_connection"] = True
    with open("/etc/cos-alerter.yaml", "w") as f:
        yaml.dump(conf, f)
    config.reload()
    # Simulate the state file left behind by a previous graceful shutdown.
    fake_fs.create_file(config["clients_file"])
    with config["clients_file"].open("w") as f:
        f.write('{"clientid1": {"alert_time": 500, "notify_time": null}}')
    monotonic_mock.return_value = 1000
    AlerterState.initialize()
    state = AlerterState(clientid="clientid1")
    with state:
        # The restored alert_time (500) is older than the mocked clock at startup (1000).
        # is_down() measures from the later of alert_time and the start time, so the client
        # must not be reported down yet.
        assert state.is_down() is False
        # Presumably 1330 seconds after startup is past the configured down_interval.
        monotonic_mock.return_value = 2330
        assert state.is_down() is True