diff --git a/.gitignore b/.gitignore
index b6308c9..8a475b0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@ venv
 *.egg-info
 *.snap
 **/__pycache__
+*.rock
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 288e880..c840ee2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.9.0] - 2024-05-30
+
+- Adds PagerDuty native support (#76).
+
+
 ## [0.8.0] - 2024-03-07
 
 - Fixes container silently running by exiting with non-zero status when configuration file is missing. (#70).
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 465d981..6cadd41 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -28,6 +28,23 @@ cp config-defaults.yaml cos-alerter.yaml
 docker run -p 8080:8080 --rm --mount type=bind,source="$(pwd)"/cos-alerter.yaml,target=/etc/cos-alerter.yaml,readonly -it cos-alerter:0.2.0
 ```
 
+## Run With Kubernetes
+Prepare the image:
+```shell
+rockcraft pack
+# update <registry-ip> with the actual IP of your docker registry
+# update <tag> with the image tag you would like to use in testing
+skopeo copy oci-archive:cos-alerter_0.9.0_amd64.rock docker://<registry-ip>/cos-alerter:<tag> --dest-tls-verify=false
+```
+
+Run:
+```shell
+# in k8s-local-test/deploy.yaml update <registry-ip>/<image-name>:<tag> with appropriate values
+# in k8s-local-test/deploy.yaml update the cos-alerter.yaml configmap with appropriate values
+kubectl apply -f k8s-local-test/deploy.yaml
+kubectl apply -f k8s-local-test/svc.yaml
+```
+
 ## Run Tests
 
 * `pip install tox`
diff --git a/cos_alerter/alerter.py b/cos_alerter/alerter.py
index 661a97e..c28d5f1 100644
--- a/cos_alerter/alerter.py
+++ b/cos_alerter/alerter.py
@@ -13,10 +13,12 @@
 import time
 import typing
 from pathlib import Path
+from typing import Optional
 
 import apprise
 import durationpy
 import xdg_base_dirs
+from pdpyras import EventsAPISession
 from ruamel.yaml import YAML
 from ruamel.yaml.constructor import DuplicateKeyError
 
@@ -211,6 +213,14 @@ def clients():
 
     def reset_alert_timeout(self):
         """Set the "last alert time" to right now."""
+        # In case an instance was down, resolve the PagerDuty incident before resetting the last alert time
+        if self.is_down():
+            pagerduty_destinations, _ = split_destinations()
+            handle_pagerduty_incidents(
+                incident_type="resolve",
+                dedup_key=f"{self.clientid}-{self.last_alert_datetime()}",
+                destinations_list=pagerduty_destinations,
+            )
         logger.debug("Resetting alert timeout for %s.", self.clientid)
         self.data["alert_time"] = time.monotonic()
 
@@ -274,7 +284,12 @@ def notify(self):
         # Sending notifications can be a long operation so handle that in a separate thread.
         # This avoids interfering with the execution of the main loop.
         notify_thread = threading.Thread(
-            target=send_notifications, kwargs={"title": title, "body": body}
+            target=send_notifications,
+            kwargs={
+                "title": title,
+                "body": body,
+                "dedup_key": f"{self.clientid}-{self.last_alert_datetime()}",
+            },
         )
         notify_thread.start()
 
@@ -290,15 +305,56 @@ def up_time():
     return time.monotonic() - state["start_time"]
 
 
-def send_notifications(title: str, body: str):
+def split_destinations():
+    """Split destinations into PagerDuty and non-PagerDuty lists."""
+    pagerduty_destinations = [
+        source for source in config["notify"]["destinations"] if source.startswith("pagerduty")
+    ]
+    non_pagerduty_destinations = [
+        source for source in config["notify"]["destinations"] if not source.startswith("pagerduty")
+    ]
+    return pagerduty_destinations, non_pagerduty_destinations
+
+
+def send_notifications(title: str, body: str, dedup_key: str):
     """Send a notification to all receivers."""
     # TODO: Since this is run in its own thread, we have to make sure we properly
     # log failures here.
+    pagerduty_destinations, non_pagerduty_destinations = split_destinations()
+
+    # Send notifications to non-PagerDuty destinations
     sender = apprise.Apprise()
-    for source in config["notify"]["destinations"]:
+    for source in non_pagerduty_destinations:
         sender.add(source)
     sender.notify(title=title, body=body)
 
+    # Send notifications to PagerDuty destinations
+    handle_pagerduty_incidents("trigger", dedup_key, pagerduty_destinations, body)
+
+
+def handle_pagerduty_incidents(
+    incident_type: str,
+    dedup_key: str,
+    destinations_list: list,
+    incident_summary: Optional[str] = None,
+):
+    """Handle PagerDuty incidents by triggering or resolving them based on the specified incident type.
+
+    Args:
+        incident_type (str): The type of incident action to perform. Should be either 'trigger' or 'resolve'.
+        dedup_key (str): The deduplication key to uniquely identify the incident.
+        destinations_list (list): List of destinations to handle PagerDuty incidents for.
+        incident_summary (str, optional): A summary of the incident, used only when triggering an incident. Defaults to None.
+ """ + for source in destinations_list: + integration_key = source.split("//")[1].split("@")[0] + session = EventsAPISession(integration_key) + + if incident_type == "trigger": + session.trigger(source="cos-alerter", summary=incident_summary, dedup_key=dedup_key) + elif incident_type == "resolve": + session.resolve(dedup_key) + def send_test_notification(): """Signal handler which sends a test email to all configured receivers.""" @@ -306,4 +362,5 @@ def send_test_notification(): send_notifications( title="COS-Alerter test email.", body="This is a test email automatically generated by COS-alerter.", + dedup_key="testkey", ) diff --git a/k8s-local-test/deploy.yaml b/k8s-local-test/deploy.yaml new file mode 100644 index 0000000..65894cf --- /dev/null +++ b/k8s-local-test/deploy.yaml @@ -0,0 +1,47 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cos-alerter +spec: + replicas: 1 + selector: + matchLabels: + app: cos-alerter + template: + metadata: + labels: + app: cos-alerter + spec: + containers: + - name: cos-alerter + image: / + imagePullPolicy: Always + ports: + - containerPort: 8080 + volumeMounts: + - name: config + mountPath: /etc/cos-alerter.yaml + subPath: cos-alerter.yaml + volumes: + - name: config + configMap: + name: cos-alerter-configmap + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: cos-alerter-configmap +data: + cos-alerter.yaml: | + watch: + down_interval: "5m" + wait_for_first_connection: true + clients: {} + notify: + destinations: [] + repeat_interval: "1h" + log_level: "info" + web_listen_addr: "0.0.0.0:8080" + + diff --git a/k8s-local-test/svc.yaml b/k8s-local-test/svc.yaml new file mode 100644 index 0000000..38b3856 --- /dev/null +++ b/k8s-local-test/svc.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: cos-alerter-service +spec: + selector: + app: cos-alerter + ports: + - protocol: TCP + port: 8080 + targetPort: 8080 + type: ClusterIP \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 996831d..105afcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cos-alerter" -version = "0.8.0" +version = "0.9.0" authors = [ { name="Dylan Stephano-Shachter", email="dylan.stephano-shachter@canonical.com" } ] @@ -28,6 +28,7 @@ dependencies = [ "timeago~=1.0", "waitress~=2.1", "xdg-base-dirs~=6.0.1", + "pdpyras~=5.2.0" ] [project.urls] diff --git a/rockcraft.yaml b/rockcraft.yaml index 274f491..9c036a9 100644 --- a/rockcraft.yaml +++ b/rockcraft.yaml @@ -1,7 +1,7 @@ name: cos-alerter summary: A liveness checker for self-monitoring. description: Receive regular pings from the cos stack and alert when they stop. -version: "0.8.0" +version: "0.9.0" base: ubuntu@22.04 license: Apache-2.0 platforms: diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index b8570a6..a7b34ad 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -1,5 +1,5 @@ name: cos-alerter -version: '0.8.0' +version: '0.9.0' summary: A watchdog alerting on alertmanager notification failures. license: Apache-2.0 contact: simon.aronsson@canonical.com