Skip to content

Commit

Permalink
[IMPROVEMENT] Tracking for failed alerts
Browse files Browse the repository at this point in the history
Even though Promgen is instrumented with Sentry, users need a way to
know if their alerts are being sent properly. While a more robust retry
mechanism will be attempted in the future this should give more
visibility into the current state

- Add `error_count` and `sent_count` to Alert model for tracking
- Add `AlertError` log for tracking failed messages
- Refactor `tasks.send_alert` to better log errors and increment counters

Future work to make sender.test more aware to avoid special casing the
alert_pk check in `tasks.send_alert`
  • Loading branch information
kfdm committed Nov 29, 2019
1 parent 167b4a9 commit efe8dd8
Show file tree
Hide file tree
Showing 11 changed files with 149 additions and 25 deletions.
27 changes: 16 additions & 11 deletions promgen/management/commands/test-alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,31 @@

from django.core.management.base import BaseCommand
from django.test import override_settings

from promgen import models, tasks, tests


class Command(BaseCommand):
data = tests.PromgenTest.data_json("examples", "alertmanager.json")

def add_arguments(self, parser):
parser.add_argument("--shard", default="Test Shard")
parser.add_argument("--service", default=self.data["commonLabels"]["service"])
parser.add_argument("--project", default=self.data["commonLabels"]["project"])

@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
def handle(self, **kwargs):
def handle(self, shard, service, project, **kwargs):
logging._handlers = []
logging.basicConfig(level=logging.DEBUG)

data = tests.PromgenTest.data_json('examples', 'alertmanager.json')

shard, _ = models.Shard.objects.get_or_create(name='Shard Test')
service, _ = models.Service.objects.get_or_create(
shard=shard, name=data['commonLabels']['service']
)
shard, _ = models.Shard.objects.get_or_create(name=shard)
service, _ = models.Service.objects.get_or_create(name=service)
project, _ = models.Project.objects.get_or_create(
service=service, name=data['commonLabels']['project']
name=project, defaults={"shard": shard, "service": service}
)

alert = models.Alert.objects.create(
body=json.dumps(data)
)
alert = models.Alert.objects.create(body=json.dumps(self.data), error_count=1)

tasks.process_alert(alert.pk)

alert.alerterror_set.create(message="Test from CLI")
33 changes: 33 additions & 0 deletions promgen/migrations/0011_notifier_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 2.2.4 on 2019-11-28 02:21

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):

dependencies = [
('promgen', '0010_app_label_migration'),
]

operations = [
migrations.AddField(
model_name='alert',
name='error_count',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='alert',
name='sent_count',
field=models.PositiveIntegerField(default=0),
),
migrations.CreateModel(
name='AlertError',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('message', models.TextField()),
('created', models.DateTimeField(default=django.utils.timezone.now)),
('alert', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='promgen.Alert')),
],
),
]
8 changes: 8 additions & 0 deletions promgen/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,8 @@ class RuleAnnotation(models.Model):
class Alert(models.Model):
created = models.DateTimeField(default=timezone.now)
body = models.TextField()
sent_count = models.PositiveIntegerField(default=0)
error_count = models.PositiveIntegerField(default=0)

def get_absolute_url(self):
return reverse("alert-detail", kwargs={"pk": self.pk})
Expand Down Expand Up @@ -531,6 +533,12 @@ def json(self):
return json.loads(self.body)


class AlertError(models.Model):
alert = models.ForeignKey(Alert, on_delete=models.CASCADE)
created = models.DateTimeField(default=timezone.now)
message = models.TextField()


class Audit(models.Model):
body = models.TextField()
created = models.DateTimeField()
Expand Down
11 changes: 9 additions & 2 deletions promgen/notification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,21 @@
import logging
import textwrap

from promgen import util

from django import forms
from django.template.loader import render_to_string

from promgen import plugins, util

logger = logging.getLogger(__name__)


def load(name):
for driver in plugins.notifications():
if name == driver.module_name:
return driver.load()()
raise ImportError("Unknown notification plugin %s" % name)


class FormSenderBase(forms.Form):
value = forms.CharField(required=True)
alias = forms.CharField(required=False)
Expand Down
30 changes: 23 additions & 7 deletions promgen/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

from atomicwrites import atomic_write
from celery import shared_task
from django.conf import settings
from promgen import models, plugins, prometheus, util

from promgen import models, prometheus, util, notification

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -62,13 +62,29 @@ def send_alert(sender, target, data, alert_pk=None):
"""
Send alert to specific target
alert_pk is used for debugging purposes
alert_pk is used when quering our alert normally and is missing
when we send a test message. In the case we send a test message
we want to raise any exceptions so that the test function can
handle it
"""
logger.debug("Sending %s %s", sender, target)
for plugin in plugins.notifications():
if sender == plugin.module_name:
instance = plugin.load()()
instance._send(target, data)
try:
notifier = notification.load(sender)
notifier._send(target, data)
except ImportError:
logging.exception("Error loading plugin %s", sender)
if alert_pk is None:
raise
except Exception as e:
logging.exception("Error sending notification")
if alert_pk:
util.inc_for_pk(models.Alert, pk=alert_pk, error_count=1)
models.AlertError.objects.create(alert_id=alert_pk, message=str(e))
else:
raise
else:
if alert_pk:
util.inc_for_pk(models.Alert, pk=alert_pk, sent_count=1)


@shared_task
Expand Down
10 changes: 10 additions & 0 deletions promgen/templates/promgen/alert_detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,15 @@
{% block content %}
{% breadcrumb alert %}

<ul class="list-unstyled">
{% for error in alert.alerterror_set.all %}
<li class="alert alert-danger" role="alert">
<span class="glyphicon glyphicon-alert" aria-hidden="true"></span>
{{error.message}}
<span class="pull-right">{{error.created}}</span>
</li>
{% endfor %}
</ul>

<pre>{{alert.json|pretty_json}}</pre>
{% endblock %}
8 changes: 7 additions & 1 deletion promgen/templates/promgen/alert_list.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
{% extends "base.html" %}

{% load promgen %}
{% block content %}

{% breadcrumb label="Alerts" %}

{% include 'promgen/pagination_short.html' %}

<table class="table">
Expand All @@ -12,6 +14,8 @@
<th>Service</th>
<th>Project</th>
<th>Job</th>
<th>Sent</th>
<th>Error</th>
</tr>
{% for alert in alert_list %}
<tr>
Expand All @@ -21,6 +25,8 @@
<td>{{alert.json.commonLabels.service}}</td>
<td>{{alert.json.commonLabels.project}}</td>
<td>{{alert.json.commonLabels.job}}</td>
<td {% if alert.sent_count == 0 %}class="warning" {% endif %}>{{alert.sent_count}}</td>
<td {% if alert.error_count %}class="danger" {% endif %}>{{alert.error_count}}</td>
</tr>
{% endfor %}
</table>
Expand Down
5 changes: 4 additions & 1 deletion promgen/templates/promgen/navbar.html
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Misc <span class="caret"></span></a>
<ul class="dropdown-menu">
<li><a href="{% url 'audit-list' %}">Log</a></li>
<li><a href="{% url 'audit-list' %}">Edit History</a></li>
<li><a href="{% url 'alert-list' %}">Alert History</a></li>
<li role="separator" class="divider"></li>

<li><a href="{% url 'farm-list' %}">Farms</a></li>
<li><a href="{% url 'host-list' %}">Hosts</a></li>
<li><a href="{% url 'shard-list' %}">Shard</a></li>
Expand Down
22 changes: 22 additions & 0 deletions promgen/tests/test_webhook.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,25 @@ def test_filter(self, mock_post):

self.assertCount(models.Alert, 1, "Alert should be queued")
self.assertEqual(mock_post.call_count, 1, "One notification should be skipped")

@override_settings(PROMGEN=TEST_SETTINGS)
@override_settings(CELERY_TASK_ALWAYS_EAGER=True)
@override_settings(CELERY_TASK_EAGER_PROPAGATES=True)
@mock.patch("promgen.util.post")
def test_failure(self, mock_post):
# When our post results in a failure, then our error_count should be
# properly updated and some errors should be logged to be viewed later
mock_post.side_effect = Exception("Boom!")

response = self.client.post(
reverse("alert"), data=TEST_ALERT, content_type="application/json"
)

self.assertRoute(response, views.Alert, 202)
self.assertCount(models.Alert, 1, "Alert should be queued")
self.assertEqual(mock_post.call_count, 2, "Two posts should be attempted")
self.assertCount(models.AlertError, 2, "Two errors should be logged")

alert = models.Alert.objects.first()
self.assertEqual(alert.sent_count, 0, "No successful sent")
self.assertEqual(alert.error_count, 2, "Error incremented")
7 changes: 6 additions & 1 deletion promgen/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# These sources are released under the terms of the MIT license: see LICENSE

import requests.sessions

from django.db.models import F
from promgen.version import __version__

from django.conf import settings
Expand Down Expand Up @@ -61,3 +61,8 @@ def __init__(self, model):

def __getattr__(self, name):
return self.model._meta.get_field(name).help_text


def inc_for_pk(model, pk, **kwargs):
# key=F('key') + value
model.objects.filter(pk=pk).update(**{key: F(key) + kwargs[key] for key in kwargs})
13 changes: 11 additions & 2 deletions promgen/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,7 @@ def post(self, request, pk):
sender = get_object_or_404(models.Sender, id=pk)
try:
sender.test()
except:
logger.exception('Error sending test message with %s', sender.sender)
except Exception:
messages.warning(request, 'Error sending test message with ' + sender.sender)
else:
messages.info(request, 'Sent test message with ' + sender.sender)
Expand Down Expand Up @@ -1101,6 +1100,16 @@ def collect(self):
except models.Alert.DoesNotExist:
pass

try:
yield CounterMetricFamily(
"promgen_alerts_failed",
"Failed Alerts",
models.AlertError.objects.latest("id").id,
)
except models.AlertError.DoesNotExist:
pass


yield GaugeMetricFamily(
"promgen_shards", "Registered Shards", models.Shard.objects.count()
)
Expand Down

0 comments on commit efe8dd8

Please sign in to comment.