Skip to content

Commit

Permalink
MGDAPI-5085 - add alerting for MCG
Browse files Browse the repository at this point in the history
  • Loading branch information
adam-cattermole committed Jan 26, 2023
1 parent 654e6ff commit 1ac6788
Show file tree
Hide file tree
Showing 5 changed files with 292 additions and 36 deletions.
135 changes: 135 additions & 0 deletions pkg/products/mcg/prometheusRules.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
package mcg

import (
"context"
"fmt"

"github.com/integr8ly/integreatly-operator/pkg/resources"
l "github.com/integr8ly/integreatly-operator/pkg/resources/logger"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// newAlertReconciler builds the alert reconciler for the MCG (NooBaa) product.
//
// The alert configurations are placed in the namespace reported by the
// observability config, while the alert expressions themselves select metrics
// from the operator namespace returned by r.Config.GetOperatorNamespace().
//
// installType is used to look up the installation name in
// resources.InstallationNames; that name is attached to every rule as the
// "product" label. ctx and client are accepted but not used by this function
// at present.
//
// It returns an error when the observability config cannot be read.
func (r *Reconciler) newAlertReconciler(ctx context.Context, client k8sclient.Client, logger l.Logger, installType string) (resources.AlertReconciler, error) {
	installationName := resources.InstallationNames[installType]

	observabilityConfig, err := r.ConfigManager.ReadObservability()
	if err != nil {
		logger.Warning("failed to get observability config")
		return nil, fmt.Errorf("reading observability config: %w", err)
	}

	namespace := observabilityConfig.GetNamespace()
	// Read once: every expression below filters on the same operator namespace.
	operatorNamespace := r.Config.GetOperatorNamespace()
	// Shared by both operator endpoint alerts; {{ $labels.endpoint }} is left
	// untouched by Sprintf and is emitted verbatim into the annotation.
	endpointDownMessage := fmt.Sprintf("No {{ $labels.endpoint }} endpoints in namespace %s. Expected at least 1.", operatorNamespace)

	return &resources.AlertReconcilerImpl{
		Installation: r.installation,
		Log:          logger,
		ProductName:  "mcg",
		Alerts: []resources.AlertConfiguration{
			{
				AlertName: "mcg-operator-ksm-endpoint-alerts",
				GroupName: "mcg-operator-endpoint.rules",
				Namespace: namespace,
				Rules: []monitoringv1.Rule{
					{
						Alert: "RHOAMMCGOperatorMetricsServiceEndpointDown",
						Annotations: map[string]string{
							"sop_url": resources.SopUrlEndpointAvailableAlert,
							"message": endpointDownMessage,
						},
						Expr:   intstr.FromString(fmt.Sprintf("kube_endpoint_address_available{namespace='%s', endpoint='noobaa-operator-service'} < 1", operatorNamespace)),
						For:    "5m",
						Labels: map[string]string{"severity": "critical", "product": installationName},
					},
					{
						Alert: "RHOAMMCGOperatorRhmiRegistryCsServiceEndpointDown",
						Annotations: map[string]string{
							"sop_url": resources.SopUrlEndpointAvailableAlert,
							"message": endpointDownMessage,
						},
						// Single quotes here match every other label matcher in this file.
						Expr:   intstr.FromString(fmt.Sprintf("kube_endpoint_address_available{endpoint='rhmi-registry-cs', namespace='%s'} < 1", operatorNamespace)),
						For:    "5m",
						Labels: map[string]string{"severity": "warning", "product": installationName},
					},
				},
			},
			{
				AlertName: "mcg-ksm-endpoint-alerts",
				GroupName: "general.rules",
				Namespace: namespace,
				Rules: []monitoringv1.Rule{
					mcgPodNotReadyRule("NooBaaCorePod", "noobaa-core", operatorNamespace, installationName),
					mcgPodNotReadyRule("NooBaaDBPod", "noobaa-db", operatorNamespace, installationName),
					mcgPodNotReadyRule("NooBaaDefaultBackingStorePod", "noobaa-default-backing-store", operatorNamespace, installationName),
					mcgPodNotReadyRule("NooBaaEndpointPod", "noobaa-endpoint", operatorNamespace, installationName),
					{
						Alert: "NooBaaS3Endpoint",
						Annotations: map[string]string{
							"sop_url": resources.SopUrlEndpointAvailableAlert,
							"message": "MCG s3 endpoint is not available.",
						},
						Expr:   intstr.FromString(fmt.Sprintf("kube_endpoint_address_available{namespace='%[1]s', endpoint='s3'} < 1", operatorNamespace)),
						For:    "5m",
						Labels: map[string]string{"severity": "warning", "product": installationName},
					},
					{
						Alert: "NooBaaBucketCapacityOver85Percent",
						Annotations: map[string]string{
							"sop_url": resources.SopUrlAlertsAndTroubleshooting,
							"message": "MCG s3 bucket is over 85% capacity.",
						},
						Expr:   intstr.FromString(fmt.Sprintf("NooBaa_bucket_capacity{namespace='%[1]s', bucket_name=~'%[2]s.*'} > 85", operatorNamespace, threescaleBucket)),
						For:    "5m",
						Labels: map[string]string{"severity": "warning", "product": installationName},
					},
					{
						Alert: "NooBaaBucketCapacityOver95Percent",
						Annotations: map[string]string{
							"sop_url": resources.SopUrlAlertsAndTroubleshooting,
							"message": "MCG s3 bucket is over 95% capacity.",
						},
						Expr:   intstr.FromString(fmt.Sprintf("NooBaa_bucket_capacity{namespace='%[1]s', bucket_name=~'%[2]s.*'} > 95", operatorNamespace, threescaleBucket)),
						For:    "5m",
						Labels: map[string]string{"severity": "warning", "product": installationName},
					},
				},
			},
		},
	}, nil
}

// mcgPodNotReadyRule returns the warning-severity rule named alertName for
// pods in operatorNamespace whose name matches podPrefix followed by anything.
//
// The expression is true when the sum of ready pods that are also in the
// Running phase is below 1, or when the readiness series for those pods is
// absent altogether; the rule uses a "for" duration of 5m. installationName is
// set as the rule's "product" label.
func mcgPodNotReadyRule(alertName, podPrefix, operatorNamespace, installationName string) monitoringv1.Rule {
	expr := fmt.Sprintf(
		"sum(kube_pod_status_ready{condition='true',namespace='%[1]s', pod=~'%[2]s.*'} * on(pod, namespace) group_left() kube_pod_status_phase{phase='Running',namespace='%[1]s'}) < 1 OR absent(kube_pod_status_ready{condition='true',namespace='%[1]s',pod=~'%[2]s.*'})",
		operatorNamespace, podPrefix,
	)
	return monitoringv1.Rule{
		Alert: alertName,
		Annotations: map[string]string{
			"sop_url": resources.SopUrlAlertsAndTroubleshooting,
			"message": fmt.Sprintf("MCG %s has no pods in a ready state.", podPrefix),
		},
		Expr:   intstr.FromString(expr),
		For:    "5m",
		Labels: map[string]string{"severity": "warning", "product": installationName},
	}
}
10 changes: 10 additions & 0 deletions pkg/products/mcg/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
defaultStorageClassAnnotation = "storageclass.kubernetes.io/is-default-class"
threescaleBucket = "3scale-operator-bucket"
ThreescaleBucketClaim = threescaleBucket + "-claim"
S3RouteName = "s3"
)

type Reconciler struct {
Expand Down Expand Up @@ -155,6 +156,15 @@ func (r *Reconciler) Reconcile(ctx context.Context, installation *integreatlyv1a
return phase, err
}

alertsReconciler, err := r.newAlertReconciler(ctx, serverClient, r.log, r.installation.Spec.Type)
if err != nil {
return integreatlyv1alpha1.PhaseFailed, err
}
if phase, err := alertsReconciler.ReconcileAlerts(ctx, serverClient); err != nil || phase != integreatlyv1alpha1.PhaseCompleted {
events.HandleError(r.recorder, installation, phase, "Failed to reconcile mcg alerts", err)
return phase, err
}

productStatus.Host = r.Config.GetHost()
productStatus.Version = r.Config.GetProductVersion()
productStatus.OperatorVersion = r.Config.GetOperatorVersion()
Expand Down
Loading

0 comments on commit 1ac6788

Please sign in to comment.