Skip to content

Commit

Permalink
adding PagerDuty support
Browse files Browse the repository at this point in the history
  • Loading branch information
IbraAoad committed May 30, 2024
1 parent 717014f commit 4105cc6
Show file tree
Hide file tree
Showing 9 changed files with 147 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ venv
*.egg-info
*.snap
**/__pycache__
*.rock
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.9.0] - 2024-05-30

- Adds native PagerDuty support (#76).


## [0.8.0] - 2024-03-07

- Fixes container silently running by exiting with non-zero status when configuration file is missing. (#70).
Expand Down
17 changes: 17 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,23 @@ cp config-defaults.yaml cos-alerter.yaml
docker run -p 8080:8080 --rm --mount type=bind,source="$(pwd)"/cos-alerter.yaml,target=/etc/cos-alerter.yaml,readonly -it cos-alerter:0.2.0
```

## Run With Kubernetes
Prepare the image:
```shell
rockcraft pack
# replace <registry-ip> with the actual IP of your docker registry
# replace <image-tag> with the image tag you would like to use in testing
skopeo copy oci-archive:cos-alerter_0.9.0_amd64.rock docker://<registry-ip>/<image-tag> --dest-tls-verify=false
```

Run:
```shell
# in k8s-local-test/deploy.yaml, replace <registry-ip>/<image-tag> with your registry address and image tag
# in k8s-local-test/deploy.yaml, edit the cos-alerter.yaml ConfigMap entry to match your setup
kubectl apply -f k8s-local-test/deploy.yaml
kubectl apply -f k8s-local-test/svc.yaml
```

## Run Tests

* `pip install tox`
Expand Down
63 changes: 60 additions & 3 deletions cos_alerter/alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
import time
import typing
from pathlib import Path
from typing import Optional

import apprise
import durationpy
import xdg_base_dirs
from pdpyras import EventsAPISession
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError

Expand Down Expand Up @@ -211,6 +213,14 @@ def clients():

def reset_alert_timeout(self):
"""Set the "last alert time" to right now."""
# In case an instance was down, resolve the PagerDuty incident before resetting the last alert time
if self.is_down():
pagerduty_destinations, _ = split_destinations()
handle_pagerduty_incidents(
incident_type="resolve",
dedup_key=f"{self.clientid}-{self.last_alert_datetime()}",
destinations_list=pagerduty_destinations,
)
logger.debug("Resetting alert timeout for %s.", self.clientid)
self.data["alert_time"] = time.monotonic()

Expand Down Expand Up @@ -274,7 +284,12 @@ def notify(self):
# Sending notifications can be a long operation so handle that in a separate thread.
# This avoids interfering with the execution of the main loop.
notify_thread = threading.Thread(
target=send_notifications, kwargs={"title": title, "body": body}
target=send_notifications,
kwargs={
"title": title,
"body": body,
"dedup_key": f"{self.clientid}-{self.last_alert_datetime()}",
},
)
notify_thread.start()

Expand All @@ -290,20 +305,62 @@ def up_time():
return time.monotonic() - state["start_time"]


def send_notifications(title: str, body: str):
def split_destinations():
    """Partition the configured notification destinations by service.

    Returns:
        A ``(pagerduty, other)`` tuple of lists. ``pagerduty`` holds every entry of
        ``config["notify"]["destinations"]`` that starts with ``"pagerduty"``;
        ``other`` holds the remaining entries. Both lists keep the configured order.
    """
    pagerduty_urls = []
    other_urls = []
    for destination in config["notify"]["destinations"]:
        bucket = pagerduty_urls if destination.startswith("pagerduty") else other_urls
        bucket.append(destination)
    return pagerduty_urls, other_urls


def send_notifications(title: str, body: str, dedup_key: str):
    """Send a notification to every configured destination.

    Destinations that start with ``pagerduty`` receive a PagerDuty "trigger"
    event; all other destinations are delivered through Apprise.

    Args:
        title: Title of the notification (used for the Apprise destinations).
        body: Body of the notification; also used as the PagerDuty incident summary.
        dedup_key: Deduplication key identifying the PagerDuty incident.
    """
    # TODO: Since this is run in its own thread, we have to make sure we properly
    # log failures here.
    pagerduty_destinations, non_pagerduty_destinations = split_destinations()

    # Send notifications to non-PagerDuty destinations. Skip Apprise entirely when
    # it has nothing to deliver to, so a PagerDuty-only configuration does not ask
    # Apprise to notify zero services.
    if non_pagerduty_destinations:
        sender = apprise.Apprise()
        for source in non_pagerduty_destinations:
            sender.add(source)
        # notify() reports delivery failure through its return value rather than
        # by raising, so the result has to be checked explicitly.
        if sender.notify(title=title, body=body) is False:
            logger.error("Failed to deliver the notification to one or more destinations.")

    # Send notifications to PagerDuty destinations
    handle_pagerduty_incidents("trigger", dedup_key, pagerduty_destinations, body)


def handle_pagerduty_incidents(
    incident_type: str,
    dedup_key: str,
    destinations_list: list,
    incident_summary: Optional[str] = None,
):
    """Trigger or resolve a PagerDuty incident on every given destination.

    A failure on one destination is logged and does not prevent the remaining
    destinations from being handled.

    Args:
        incident_type: The action to perform, either ``"trigger"`` or ``"resolve"``.
        dedup_key: The deduplication key that identifies the incident.
        destinations_list: PagerDuty destination URLs to act on.
        incident_summary: Summary of the incident; only used when triggering.

    Raises:
        ValueError: If ``incident_type`` is neither ``"trigger"`` nor ``"resolve"``.
    """
    # Reject unknown actions up front instead of silently doing nothing.
    if incident_type not in ("trigger", "resolve"):
        raise ValueError(
            f"Unsupported PagerDuty incident type {incident_type!r}; "
            "expected 'trigger' or 'resolve'."
        )

    for source in destinations_list:
        # The integration key is the text between "//" and the first "@" of the
        # destination URL.
        url_parts = source.split("//")
        integration_key = url_parts[1].split("@")[0] if len(url_parts) > 1 else ""
        if not integration_key:
            # Do not log the destination itself: it may embed credentials.
            logger.error("Skipping a PagerDuty destination without an integration key.")
            continue

        try:
            session = EventsAPISession(integration_key)
            if incident_type == "trigger":
                session.trigger(
                    source="cos-alerter", summary=incident_summary, dedup_key=dedup_key
                )
            else:
                session.resolve(dedup_key)
        except Exception:
            # Per-destination boundary: record the failure and carry on so one
            # unreachable or misconfigured destination cannot block the others
            # (or abort the caller).
            logger.exception("Failed to %s PagerDuty incident %s.", incident_type, dedup_key)


def send_test_notification():
    """Signal handler that pushes a canned test message to all configured receivers."""
    logger.info("Sending test notifications.")
    # Fixed payload; the dedup key is constant for test notifications.
    test_message = {
        "title": "COS-Alerter test email.",
        "body": "This is a test email automatically generated by COS-alerter.",
        "dedup_key": "testkey",
    }
    send_notifications(**test_message)
47 changes: 47 additions & 0 deletions k8s-local-test/deploy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: cos-alerter
spec:
replicas: 1
selector:
matchLabels:
app: cos-alerter
template:
metadata:
labels:
app: cos-alerter
spec:
containers:
- name: cos-alerter
image: <registry-ip>/<image-tag>
imagePullPolicy: Always
ports:
- containerPort: 8080
volumeMounts:
- name: config
mountPath: /etc/cos-alerter.yaml
subPath: cos-alerter.yaml
volumes:
- name: config
configMap:
name: cos-alerter-configmap

---
apiVersion: v1
kind: ConfigMap
metadata:
name: cos-alerter-configmap
data:
cos-alerter.yaml: |
watch:
down_interval: "5m"
wait_for_first_connection: true
clients: {}
notify:
destinations: []
repeat_interval: "1h"
log_level: "info"
web_listen_addr: "0.0.0.0:8080"
13 changes: 13 additions & 0 deletions k8s-local-test/svc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
apiVersion: v1
kind: Service
metadata:
name: cos-alerter-service
spec:
selector:
app: cos-alerter
ports:
- protocol: TCP
port: 8080
targetPort: 8080
type: ClusterIP
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cos-alerter"
version = "0.8.0"
version = "0.9.0"
authors = [
{ name="Dylan Stephano-Shachter", email="dylan.stephano-shachter@canonical.com" }
]
Expand All @@ -28,6 +28,7 @@ dependencies = [
"timeago~=1.0",
"waitress~=2.1",
"xdg-base-dirs~=6.0.1",
"pdpyras~=5.2.0"
]

[project.urls]
Expand Down
2 changes: 1 addition & 1 deletion rockcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: cos-alerter
summary: A liveness checker for self-monitoring.
description: Receive regular pings from the cos stack and alert when they stop.
version: "0.8.0"
version: "0.9.0"
base: ubuntu@22.04
license: Apache-2.0
platforms:
Expand Down
2 changes: 1 addition & 1 deletion snap/snapcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cos-alerter
version: '0.8.0'
version: '0.9.0'
summary: A watchdog alerting on alertmanager notification failures.
license: Apache-2.0
contact: simon.aronsson@canonical.com
Expand Down

0 comments on commit 4105cc6

Please sign in to comment.