Skip to content

Commit

Permalink
Re-send notifications previously suppressed by their time periods
Browse files Browse the repository at this point in the history
refs #6167
  • Loading branch information
Al2Klimov authored and Henrik Triem committed May 8, 2020
1 parent d5d89b7 commit 2912f3e
Show file tree
Hide file tree
Showing 7 changed files with 245 additions and 66 deletions.
103 changes: 62 additions & 41 deletions lib/icinga/checkable-notification.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,25 +147,7 @@ static void FireSuppressedNotifications(Checkable* checkable)

for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
if (suppressed_types & type) {
bool still_applies;
auto cr (checkable->GetLastCheckResult());

switch (type) {
case NotificationProblem:
still_applies = cr && !checkable->IsStateOK(cr->GetState()) && checkable->GetStateType() == StateTypeHard;
break;
case NotificationRecovery:
still_applies = cr && checkable->IsStateOK(cr->GetState());
break;
case NotificationFlappingStart:
still_applies = checkable->IsFlapping();
break;
case NotificationFlappingEnd:
still_applies = !checkable->IsFlapping();
break;
default:
break;
}
bool still_applies = checkable->NotificationReasonApplies(type);

if (still_applies) {
bool still_suppressed;
Expand All @@ -185,28 +167,8 @@ static void FireSuppressedNotifications(Checkable* checkable)
break;
}

if (!still_suppressed && checkable->GetEnableActiveChecks()) {
/* If e.g. the downtime just ended, but the service is still not ok, we would re-send the stashed problem notification.
* But if the next check result recovers the service soon, we would send a recovery notification soon after the problem one.
* This is not desired, especially for lots of services at once.
* Because of that if there's likely to be a check result soon,
* we delay the re-sending of the stashed notification until the next check.
* That check either doesn't change anything and we finally re-send the stashed problem notification
* or recovers the service and we drop the stashed notification. */

/* One minute unless the check interval is too short so the next check will always run during the next minute. */
auto threshold (checkable->GetCheckInterval() - 10);

if (threshold > 60)
threshold = 60;
else if (threshold < 0)
threshold = 0;

still_suppressed = checkable->GetNextCheck() <= Utility::GetTime() + threshold;
}

if (!still_suppressed) {
Checkable::OnNotificationsRequested(checkable, type, cr, "", "", nullptr);
if (!still_suppressed && !checkable->IsLikelyToBeCheckedSoon()) {
Checkable::OnNotificationsRequested(checkable, type, checkable->GetLastCheckResult(), "", "", nullptr);

subtract |= type;
}
Expand Down Expand Up @@ -241,3 +203,62 @@ void Checkable::FireSuppressedNotifications(const Timer * const&)
::FireSuppressedNotifications(service.get());
}
}

/**
 * Tells whether a notification of the given type, if sent right now,
 * would still describe this checkable's current state correctly.
 *
 * The decision per type:
 * - NotificationProblem: a last check result exists, its state is not OK
 *   and the current state type is hard.
 * - NotificationRecovery: a last check result exists and its state is OK.
 * - NotificationFlappingStart: the checkable is flapping.
 * - NotificationFlappingEnd: the checkable is not flapping.
 *
 * Any other type is not supported here and trips VERIFY().
 *
 * @param type The type of notification to send (or not to send).
 *
 * @return Whether to send the notification.
 */
bool Checkable::NotificationReasonApplies(NotificationType type)
{
	switch (type) {
		case NotificationProblem:
			{
				auto cr (GetLastCheckResult());
				return cr && !IsStateOK(cr->GetState()) && GetStateType() == StateTypeHard;
			}
		case NotificationRecovery:
			{
				auto cr (GetLastCheckResult());
				return cr && IsStateOK(cr->GetState());
			}
		case NotificationFlappingStart:
			return IsFlapping();
		case NotificationFlappingEnd:
			return !IsFlapping();
		default:
			/* The message names this very method so a failed assertion can be traced back to it. */
			VERIFY(!"Checkable#NotificationReasonApplies(): given type not implemented");
			return false;
	}
}

/**
 * Estimates whether a new check result for *this is about to arrive.
 *
 * Rationale: when a stashed problem notification is about to be re-sent because
 * *this is still not ok, a check result recovering *this shortly afterwards would
 * trigger a recovery notification right after the problem one. That is not desired,
 * especially for lots of checkables at once. So if a check result is likely to
 * arrive soon, the caller can postpone re-sending until after that check, which
 * either changes nothing (the stashed notification is finally re-sent) or
 * recovers *this (the stashed notification is dropped).
 *
 * With active checks disabled this always yields false.
 *
 * @return Whether *this is likely to be checked soon
 */
bool Checkable::IsLikelyToBeCheckedSoon()
{
	if (!GetEnableActiveChecks())
		return false;

	/* Look-ahead window: the check interval minus 10, limited to the range 0..60,
	 * i.e. one minute unless the check interval is so short that the next check
	 * will always run during the next minute anyway. */
	auto window (GetCheckInterval() - 10);

	if (window > 60)
		window = 60;
	else if (window < 0)
		window = 0;

	return Utility::GetTime() + window >= GetNextCheck();
}
3 changes: 3 additions & 0 deletions lib/icinga/checkable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ class Checkable : public ObjectImpl<Checkable>
void ValidateRetryInterval(const Lazy<double>& lvalue, const ValidationUtils& value) final;
void ValidateMaxCheckAttempts(const Lazy<int>& lvalue, const ValidationUtils& value) final;

bool NotificationReasonApplies(NotificationType type);
bool IsLikelyToBeCheckedSoon();

static void IncreasePendingChecks();
static void DecreasePendingChecks();
static int GetPendingChecks();
Expand Down
48 changes: 48 additions & 0 deletions lib/icinga/clusterevents.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ INITIALIZE_ONCE(&ClusterEvents::StaticInitialize);
REGISTER_APIFUNCTION(CheckResult, event, &ClusterEvents::CheckResultAPIHandler);
REGISTER_APIFUNCTION(SetNextCheck, event, &ClusterEvents::NextCheckChangedAPIHandler);
REGISTER_APIFUNCTION(SetSuppressedNotifications, event, &ClusterEvents::SuppressedNotificationsChangedAPIHandler);
REGISTER_APIFUNCTION(SetSuppressedNotificationTypes, event, &ClusterEvents::SuppressedNotificationTypesChangedAPIHandler);
REGISTER_APIFUNCTION(SetNextNotification, event, &ClusterEvents::NextNotificationChangedAPIHandler);
REGISTER_APIFUNCTION(SetForceNextCheck, event, &ClusterEvents::ForceNextCheckChangedAPIHandler);
REGISTER_APIFUNCTION(SetForceNextNotification, event, &ClusterEvents::ForceNextNotificationChangedAPIHandler);
Expand All @@ -40,6 +41,7 @@ void ClusterEvents::StaticInitialize()
Checkable::OnNewCheckResult.connect(&ClusterEvents::CheckResultHandler);
Checkable::OnNextCheckChanged.connect(&ClusterEvents::NextCheckChangedHandler);
Checkable::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationsChangedHandler);
Notification::OnSuppressedNotificationsChanged.connect(&ClusterEvents::SuppressedNotificationTypesChangedHandler);
Notification::OnNextNotificationChanged.connect(&ClusterEvents::NextNotificationChangedHandler);
Checkable::OnForceNextCheckChanged.connect(&ClusterEvents::ForceNextCheckChangedHandler);
Checkable::OnForceNextNotificationChanged.connect(&ClusterEvents::ForceNextNotificationChangedHandler);
Expand Down Expand Up @@ -296,6 +298,52 @@ Value ClusterEvents::SuppressedNotificationsChangedAPIHandler(const MessageOrigi
return Empty;
}

/**
 * Relays a notification object's changed suppressed notification types bitmask
 * as an "event::SetSuppressedNotificationTypes" JSON-RPC message via the API listener.
 * Does nothing if there is no API listener instance.
 *
 * @param notification The notification object whose bitmask changed.
 * @param origin Where the change came from; handed on to RelayMessage().
 */
void ClusterEvents::SuppressedNotificationTypesChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin)
{
	ApiListener::Ptr apiListener = ApiListener::GetInstance();

	if (apiListener) {
		/* Payload: which notification object, and its current bitmask. */
		Dictionary::Ptr payload = new Dictionary();
		payload->Set("notification", notification->GetName());
		payload->Set("suppressed_notifications", notification->GetSuppressedNotifications());

		/* JSON-RPC envelope around the payload. */
		Dictionary::Ptr envelope = new Dictionary();
		envelope->Set("jsonrpc", "2.0");
		envelope->Set("method", "event::SetSuppressedNotificationTypes");
		envelope->Set("params", payload);

		apiListener->RelayMessage(origin, notification, envelope, true);
	}
}

/**
 * Handles an incoming "suppressed notification types changed" cluster message:
 * applies the received bitmask to the named notification object.
 *
 * The message is discarded (with a notice-level log entry) if the sending client
 * has no endpoint, or if the origin zone may not access the notification object.
 * It is silently ignored if no notification object with the given name exists.
 *
 * @param origin The origin of the message (client and zone).
 * @param params The message parameters; "notification" and "suppressed_notifications" are read.
 *
 * @return Always Empty.
 */
Value ClusterEvents::SuppressedNotificationTypesChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params)
{
	Endpoint::Ptr endpoint = origin->FromClient->GetEndpoint();

	if (!endpoint) {
		/* Use the same event name as the "Unauthorized access" message below,
		 * so that this log line can't be mistaken for the checkable-level
		 * 'suppressed notifications changed' event. */
		Log(LogNotice, "ClusterEvents")
			<< "Discarding 'suppressed notification types changed' message from '" << origin->FromClient->GetIdentity() << "': Invalid endpoint origin (client not allowed).";
		return Empty;
	}

	auto notification (Notification::GetByName(params->Get("notification")));

	if (!notification)
		return Empty;

	if (origin->FromZone && !origin->FromZone->CanAccessObject(notification)) {
		Log(LogNotice, "ClusterEvents")
			<< "Discarding 'suppressed notification types changed' message for notification '" << notification->GetName()
			<< "' from '" << origin->FromClient->GetIdentity() << "': Unauthorized access.";
		return Empty;
	}

	/* Pass the origin along with the new value. */
	notification->SetSuppressedNotifications(params->Get("suppressed_notifications"), false, origin);

	return Empty;
}

void ClusterEvents::NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin)
{
ApiListener::Ptr listener = ApiListener::GetInstance();
Expand Down
3 changes: 3 additions & 0 deletions lib/icinga/clusterevents.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class ClusterEvents
static void SuppressedNotificationsChangedHandler(const Checkable::Ptr& checkable, const MessageOrigin::Ptr& origin);
static Value SuppressedNotificationsChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

static void SuppressedNotificationTypesChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin);
static Value SuppressedNotificationTypesChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

static void NextNotificationChangedHandler(const Notification::Ptr& notification, const MessageOrigin::Ptr& origin);
static Value NextNotificationChangedAPIHandler(const MessageOrigin::Ptr& origin, const Dictionary::Ptr& params);

Expand Down
33 changes: 33 additions & 0 deletions lib/icinga/notification.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,39 @@ void Notification::BeginExecuteNotification(NotificationType type, const CheckRe
Log(LogNotice, "Notification")
<< "Not sending " << (reminder ? "reminder " : "") << "notifications for notification object '" << notificationName
<< "': not in timeperiod '" << tp->GetName() << "'";

if (!reminder) {
switch (type) {
case NotificationProblem:
case NotificationRecovery:
case NotificationFlappingStart:
case NotificationFlappingEnd:
{
/* If a non-reminder notification was suppressed, but just because of its time period,
* stash it into a notification types bitmask for maybe re-sending later.
*/

ObjectLock olock (this);
int suppressedTypesBefore (GetSuppressedNotifications());
int suppressedTypesAfter (suppressedTypesBefore | type);

for (int conflict : {NotificationProblem | NotificationRecovery, NotificationFlappingStart | NotificationFlappingEnd}) {
/* E.g. problem and recovery notifications neutralize each other. */

if ((suppressedTypesAfter & conflict) == conflict) {
suppressedTypesAfter &= ~conflict;
}
}

if (suppressedTypesAfter != suppressedTypesBefore) {
SetSuppressedNotifications(suppressedTypesAfter);
}
}
default:
; // Cheating the compiler on "5 enumeration values not handled in switch"
}
}

return;
}

Expand Down
4 changes: 4 additions & 0 deletions lib/icinga/notification.ti
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ class Notification : CustomVarObject < NotificationNameComposer
[state] int notification_number;
[state] Timestamp last_problem_notification;

[state, no_user_view, no_user_modify] int suppressed_notifications {
default {{{ return 0; }}}
};

[config, navigation] name(Endpoint) command_endpoint (CommandEndpointRaw) {
navigate {{{
return Endpoint::GetByName(GetCommandEndpointRaw());
Expand Down
117 changes: 92 additions & 25 deletions lib/notification/notificationcomponent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,69 @@ void NotificationComponent::Stop(bool runtimeRemoved)
ObjectImpl<NotificationComponent>::Stop(runtimeRemoved);
}

/**
 * Clears the given notification type bits from a notification object's
 * suppressed notifications bitmask while holding the object's lock.
 * The setter is only called if the bitmask actually changes.
 *
 * @param notification The notification object to update.
 * @param types Bitmask of the notification types to clear.
 */
static inline
void SubtractSuppressedNotificationTypes(const Notification::Ptr& notification, int types)
{
	ObjectLock olock (notification);

	const int before = notification->GetSuppressedNotifications();
	const int after = before & ~types;

	if (after == before)
		return;

	notification->SetSuppressedNotifications(after);
}

/**
 * Processes the notification types stashed in a notification object's
 * suppressed notifications bitmask.
 *
 * Types whose reason no longer applies to the checkable are dropped from the bitmask.
 * The remaining ones are re-sent, provided that the notification has no time period
 * or is currently inside it and the checkable is not likely to be checked soon.
 * Otherwise they stay stashed.
 *
 * @param notification The notification object to process.
 */
static inline
void FireSuppressedNotifications(const Notification::Ptr& notification)
{
	int pending (notification->GetSuppressedNotifications());

	if (pending == 0)
		return;

	auto checkable (notification->GetCheckable());
	int toClear = 0;

	/* Step 1: sort out the types which don't match the checkable's state anymore. */
	for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
		if (!(pending & type))
			continue;

		if (checkable->NotificationReasonApplies(type))
			continue;

		toClear |= type;
		pending &= ~type;
	}

	/* Step 2: try to re-send what is left. */
	if (pending) {
		auto tp (notification->GetPeriod());
		bool periodAllows = !tp || tp->IsInside(Utility::GetTime());

		if (periodAllows && !checkable->IsLikelyToBeCheckedSoon()) {
			for (auto type : {NotificationProblem, NotificationRecovery, NotificationFlappingStart, NotificationFlappingEnd}) {
				if (pending & type) {
					auto notificationName (notification->GetName());

					Log(LogNotice, "NotificationComponent")
						<< "Attempting to re-send previously suppressed notification '" << notificationName << "'.";

					/* Clear the type (plus anything collected so far) before sending.
					 * NOTE(review): presumably so that a type re-stashed during
					 * BeginExecuteNotification() isn't wiped afterwards -- confirm. */
					toClear |= type;
					SubtractSuppressedNotificationTypes(notification, toClear);
					toClear = 0;

					try {
						notification->BeginExecuteNotification(type, checkable->GetLastCheckResult(), false, false);
					} catch (const std::exception& ex) {
						Log(LogWarning, "NotificationComponent")
							<< "Exception occurred during notification for object '"
							<< notificationName << "': " << DiagnosticInformation(ex, false);
					}
				}
			}
		}
	}

	/* Step 3: clear whatever was collected but not flushed yet. */
	if (toClear) {
		SubtractSuppressedNotificationTypes(notification, toClear);
	}
}

/**
* Periodically sends notifications.
*
Expand Down Expand Up @@ -104,37 +167,41 @@ void NotificationComponent::NotificationTimerHandler()
bool reachable = checkable->IsReachable(DependencyNotification);

if (reachable) {
Array::Ptr unstashedNotifications = new Array();

{
auto stashedNotifications (notification->GetStashedNotifications());
ObjectLock olock(stashedNotifications);
Array::Ptr unstashedNotifications = new Array();

stashedNotifications->CopyTo(unstashedNotifications);
stashedNotifications->Clear();
}
{
auto stashedNotifications (notification->GetStashedNotifications());
ObjectLock olock(stashedNotifications);

ObjectLock olock(unstashedNotifications);
stashedNotifications->CopyTo(unstashedNotifications);
stashedNotifications->Clear();
}

for (Dictionary::Ptr unstashedNotification : unstashedNotifications) {
try {
Log(LogNotice, "NotificationComponent")
<< "Attempting to send stashed notification '" << notificationName << "'.";

notification->BeginExecuteNotification(
(NotificationType)(int)unstashedNotification->Get("type"),
(CheckResult::Ptr)unstashedNotification->Get("cr"),
(bool)unstashedNotification->Get("force"),
(bool)unstashedNotification->Get("reminder"),
(String)unstashedNotification->Get("author"),
(String)unstashedNotification->Get("text")
);
} catch (const std::exception& ex) {
Log(LogWarning, "NotificationComponent")
<< "Exception occurred during notification for object '"
<< notificationName << "': " << DiagnosticInformation(ex, false);
ObjectLock olock(unstashedNotifications);

for (Dictionary::Ptr unstashedNotification : unstashedNotifications) {
try {
Log(LogNotice, "NotificationComponent")
<< "Attempting to send stashed notification '" << notificationName << "'.";

notification->BeginExecuteNotification(
(NotificationType)(int)unstashedNotification->Get("type"),
(CheckResult::Ptr)unstashedNotification->Get("cr"),
(bool)unstashedNotification->Get("force"),
(bool)unstashedNotification->Get("reminder"),
(String)unstashedNotification->Get("author"),
(String)unstashedNotification->Get("text")
);
} catch (const std::exception& ex) {
Log(LogWarning, "NotificationComponent")
<< "Exception occurred during notification for object '"
<< notificationName << "': " << DiagnosticInformation(ex, false);
}
}
}

FireSuppressedNotifications(notification);
}

if (notification->GetInterval() <= 0 && notification->GetNoMoreNotifications()) {
Expand Down

0 comments on commit 2912f3e

Please sign in to comment.