Skip to content

Commit

Permalink
Adding PagerDuty Native support (#76)
Browse files Browse the repository at this point in the history
* adding PagerDuty support

* refactor logic

* fix static checks

* remove local k8s manifests
  • Loading branch information
IbraAoad authored Jun 4, 2024
1 parent fcc632c commit 9aaaf72
Show file tree
Hide file tree
Showing 9 changed files with 125 additions and 33 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ venv
*.egg-info
*.snap
**/__pycache__
*.rock
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.9.0] - 2024-05-30

- Added PagerDuty native support (#76).


## [0.8.0] - 2024-03-07

- Fixed the container silently continuing to run when the configuration file is missing; it now exits with a non-zero status (#70).
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cp config-defaults.yaml cos-alerter.yaml
docker run -p 8080:8080 --rm --mount type=bind,source="$(pwd)"/cos-alerter.yaml,target=/etc/cos-alerter.yaml,readonly -it cos-alerter:0.2.0
```


## Run Tests

* `pip install tox`
Expand Down
84 changes: 80 additions & 4 deletions cos_alerter/alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
import time
import typing
from pathlib import Path
from typing import Dict, List, Optional

import apprise
import durationpy
import xdg_base_dirs
from pdpyras import EventsAPISession
from ruamel.yaml import YAML
from ruamel.yaml.constructor import DuplicateKeyError

Expand Down Expand Up @@ -211,6 +213,9 @@ def clients():

def reset_alert_timeout(self):
    """Set the "last alert time" to right now.

    If the client is currently reported as down, its existing alerts are
    resolved first, and only then is the alert time overwritten.
    """
    # Resolve before touching alert_time: the resolve call builds its dedup key
    # from last_alert_datetime(), which presumably depends on the value that is
    # about to be replaced -- TODO confirm against last_alert_datetime().
    client_was_down = self.is_down()
    if client_was_down:
        self.resolve_existing_alerts()

    logger.debug("Resetting alert timeout for %s.", self.clientid)
    self.data["alert_time"] = time.monotonic()

Expand Down Expand Up @@ -274,10 +279,26 @@ def notify(self):
# Sending notifications can be a long operation so handle that in a separate thread.
# This avoids interfering with the execution of the main loop.
notify_thread = threading.Thread(
target=send_notifications, kwargs={"title": title, "body": body}
target=send_all_notifications,
kwargs={
"title": title,
"body": body,
"destinations": split_destinations(config["notify"]["destinations"]),
"incident_type": "trigger",
"dedup_key": f"{self.clientid}-{self.last_alert_datetime()}",
},
)
notify_thread.start()

def resolve_existing_alerts(self):
    """Resolve this client's open PagerDuty incident on every PagerDuty destination.

    Only destinations categorized as "pagerduty" are contacted. The dedup key
    uses the same "<clientid>-<last alert datetime>" format that notify() uses
    when triggering.
    """
    pagerduty_urls = split_destinations(config["notify"]["destinations"])["pagerduty"]
    incident_key = f"{self.clientid}-{self.last_alert_datetime()}"
    handle_pagerduty_incidents(
        incident_type="resolve",
        dedup_key=incident_key,
        destinations=pagerduty_urls,
    )


def now_datetime():
"""Return the current datetime using the monotonic clock."""
Expand All @@ -290,20 +311,75 @@ def up_time():
return time.monotonic() - state["start_time"]


def send_notifications(title: str, body: str):
def split_destinations(destinations: List[str]) -> Dict[str, List[str]]:
    """Partition destination URLs into "standard" and "pagerduty" groups.

    A destination belongs to the "pagerduty" group when it starts with the
    text "pagerduty"; everything else is "standard". Input order is kept
    within each group, and both keys are always present in the result.
    """
    buckets: Dict[str, List[str]] = {"standard": [], "pagerduty": []}
    for url in destinations:
        group = "pagerduty" if url.startswith("pagerduty") else "standard"
        buckets[group].append(url)
    return buckets


def send_all_notifications(
    title: str, body: str, destinations: Dict[str, List[str]], incident_type: str, dedup_key: str
):
    """Send one alert to the standard receivers, then to PagerDuty.

    Args:
        title: Notification title, used for the standard receivers only.
        body: Notification body; also passed to PagerDuty as the incident summary.
        destinations: Mapping with "standard" and "pagerduty" URL lists.
        incident_type: PagerDuty action, forwarded unchanged.
        dedup_key: PagerDuty deduplication key, forwarded unchanged.
    """
    standard_urls = destinations["standard"]
    send_standard_notifications(title=title, body=body, destinations=standard_urls)

    pagerduty_urls = destinations["pagerduty"]
    handle_pagerduty_incidents(
        incident_type=incident_type,
        dedup_key=dedup_key,
        destinations=pagerduty_urls,
        incident_summary=body,
    )


def send_standard_notifications(title: str, body: str, destinations: list):
    """Send a notification to every standard (non-PagerDuty) receiver.

    Each destination URL is registered on a fresh Apprise instance, and a
    single notify call then delivers the title and body to all of them.
    """
    # TODO: Since this is run in its own thread, we have to make sure we properly
    # log failures here.
    notifier = apprise.Apprise()
    for url in destinations:
        notifier.add(url)
    notifier.notify(title=title, body=body)


def handle_pagerduty_incidents(
    incident_type: str,
    dedup_key: str,
    destinations: list,
    incident_summary: Optional[str] = None,
):
    """Trigger or resolve a PagerDuty incident on every given destination.

    Each destination is handled independently: a malformed URL or a failed
    Events API call is logged and the remaining destinations are still
    processed. This keeps one broken integration from blocking the others,
    and keeps a PagerDuty failure from propagating into the caller.

    Args:
        incident_type (str): The action to perform, either 'trigger' or 'resolve'.
            Any other value is logged as a warning and no event is sent.
        dedup_key (str): The deduplication key that identifies the incident.
        destinations (list): PagerDuty destination URLs. The integration key is
            the text between "//" and the first "@".
        incident_summary (str, optional): Summary of the incident, used only
            when triggering. Defaults to None.
    """
    if incident_type not in ("trigger", "resolve"):
        logger.warning("Unknown PagerDuty incident type %r; no event sent.", incident_type)
        return

    for source in destinations:
        try:
            integration_key = source.split("//")[1].split("@")[0]
            session = EventsAPISession(integration_key)
            if incident_type == "trigger":
                session.trigger(
                    source="cos-alerter", summary=incident_summary, dedup_key=dedup_key
                )
            else:
                session.resolve(dedup_key)
        except Exception:
            # Boundary handler: this runs either in a notification thread or on
            # the alert-reset path, and neither should die because PagerDuty is
            # unreachable or a URL is malformed. The destination URL is
            # deliberately not logged because it embeds the integration key.
            logger.exception(
                "Failed to %s PagerDuty incident with dedup key %s.", incident_type, dedup_key
            )


def send_test_notification():
    """Signal handler which sends a test notification to all configured receivers.

    The configured destinations are categorized, and a fixed test message is
    sent through every channel; PagerDuty receives a "trigger" event with a
    constant dedup key.
    """
    logger.info("Sending test notifications.")
    categorized = split_destinations(config["notify"]["destinations"])
    send_all_notifications(
        title="COS-Alerter test email.",
        body="This is a test email automatically generated by COS-alerter.",
        destinations=categorized,
        incident_type="trigger",
        dedup_key="test-dedup-key",
    )
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "cos-alerter"
version = "0.8.0"
version = "0.9.0"
authors = [
{ name="Dylan Stephano-Shachter", email="dylan.stephano-shachter@canonical.com" }
]
Expand All @@ -28,6 +28,7 @@ dependencies = [
"timeago~=1.0",
"waitress~=2.1",
"xdg-base-dirs~=6.0.1",
"pdpyras~=5.2.0"
]

[project.urls]
Expand Down
2 changes: 1 addition & 1 deletion rockcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name: cos-alerter
summary: A liveness checker for self-monitoring.
description: Receive regular pings from the cos stack and alert when they stop.
version: "0.8.0"
version: "0.9.0"
base: ubuntu@22.04
license: Apache-2.0
platforms:
Expand Down
2 changes: 1 addition & 1 deletion snap/snapcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cos-alerter
version: '0.8.0'
version: '0.9.0'
summary: A watchdog alerting on alertmanager notification failures.
license: Apache-2.0
contact: simon.aronsson@canonical.com
Expand Down
1 change: 1 addition & 0 deletions tests/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
DESTINATIONS = [
"mailtos://user:pass@domain/?to=example-0@example.com,example-1@example.com",
"slack://xoxb-1234-1234-4ddbc191d40ee098cbaae6f3523ada2d/#general",
"pagerduty://integration-key@api-key",
]

CONFIG = {
Expand Down
59 changes: 33 additions & 26 deletions tests/test_alerter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,24 @@
import freezegun
import yaml
from helpers import DESTINATIONS
from pdpyras import EventsAPISession

from cos_alerter.alerter import AlerterState, config, send_test_notification, up_time
from cos_alerter.alerter import (
AlerterState,
config,
send_test_notification,
split_destinations,
up_time,
)


def assert_notifications(notify_mock, add_mock, pd_mock, title, body, dedup_key):
    """Assert that both the Apprise and PagerDuty mocks saw the expected calls.

    Checks that every standard destination was added to Apprise, that the
    notification carried the given title and body, and that the PagerDuty
    mock was called with the body as summary and the given dedup key.
    """
    standard_urls = split_destinations(DESTINATIONS)["standard"]
    expected_adds = [unittest.mock.call(url) for url in standard_urls]
    add_mock.assert_has_calls(expected_adds)
    notify_mock.assert_called_with(title=title, body=body)
    pd_mock.assert_called_with(source="cos-alerter", summary=body, dedup_key=dedup_key)


def test_config_gets_item(fake_fs):
Expand Down Expand Up @@ -142,7 +153,8 @@ def test_is_down_from_initialize(monotonic_mock, fake_fs):

@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs):
@unittest.mock.patch.object(EventsAPISession, "resolve")
def test_is_down_with_reset_alert_timeout(pd_mock, monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="clientid1")
Expand All @@ -153,6 +165,7 @@ def test_is_down_with_reset_alert_timeout(monotonic_mock, fake_fs):
assert state.is_down() is False
monotonic_mock.return_value = 2330 # Five and a half minutes have passed
assert state.is_down() is True
pd_mock.assert_called_with(f"{state.clientid}-None")


@freezegun.freeze_time("2023-01-01")
Expand Down Expand Up @@ -201,21 +214,6 @@ def test_is_down_from_graceful_shutdown(monotonic_mock, fake_fs):
assert state.is_down() is True


@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_is_down(monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="clientid1")
with state:
monotonic_mock.return_value = 2000
state.reset_alert_timeout()
monotonic_mock.return_value = 2180 # Three minutes have passed
assert state.is_down() is False
monotonic_mock.return_value = 2330 # Five and a half minutes have passed
assert state.is_down() is True


@freezegun.freeze_time("2023-01-01")
@unittest.mock.patch("time.monotonic")
def test_recently_notified(monotonic_mock, fake_fs):
Expand All @@ -234,46 +232,55 @@ def test_recently_notified(monotonic_mock, fake_fs):
@unittest.mock.patch("time.monotonic")
@unittest.mock.patch.object(apprise.Apprise, "add")
@unittest.mock.patch.object(apprise.Apprise, "notify")
def test_notify(notify_mock, add_mock, monotonic_mock, fake_fs):
@unittest.mock.patch.object(EventsAPISession, "trigger")
def test_notify(pd_mock, notify_mock, add_mock, monotonic_mock, fake_fs):
monotonic_mock.return_value = 1000
AlerterState.initialize()
state = AlerterState(clientid="clientid1")

dedup_key = f"{state.clientid}-{state.last_alert_datetime()}"
with state:
state.notify()
for thread in threading.enumerate():
if thread != threading.current_thread():
thread.join()

assert_notifications(
notify_mock,
add_mock,
notify_mock=notify_mock,
add_mock=add_mock,
pd_mock=pd_mock,
title="**Alertmanager is Down!**",
body=textwrap.dedent(
"""
Your Alertmanager instance: clientid1 seems to be down!
It has not alerted COS-Alerter ever.
"""
),
dedup_key=dedup_key,
)

# Make sure if we try again, nothing is sent
notify_mock.reset_mock()
pd_mock.reset_mock()

with state:
state.notify()
for thread in threading.enumerate():
if thread != threading.current_thread():
thread.join()
notify_mock.assert_not_called()
pd_mock.assert_not_called()


@unittest.mock.patch.object(apprise.Apprise, "add")
@unittest.mock.patch.object(apprise.Apprise, "notify")
@unittest.mock.patch.object(EventsAPISession, "trigger")
def test_send_test_notification(pd_mock, notify_mock, add_mock, fake_fs):
    """The test notification reaches the Apprise receivers and triggers PagerDuty."""
    expected_title = "COS-Alerter test email."
    expected_body = "This is a test email automatically generated by COS-alerter."

    send_test_notification()

    assert_notifications(
        notify_mock=notify_mock,
        add_mock=add_mock,
        pd_mock=pd_mock,
        title=expected_title,
        body=expected_body,
        dedup_key="test-dedup-key",
    )

0 comments on commit 9aaaf72

Please sign in to comment.