Add support for grouped alert rules
ralongit committed Nov 5, 2024
1 parent 4c3c236 commit 00b9ac8
Showing 2 changed files with 131 additions and 13 deletions.
45 changes: 32 additions & 13 deletions controller/controller.go
@@ -3,6 +3,8 @@ package controller
 import (
 	"context"
 	"fmt"
+	"time"
+
 	"github.com/logzio/logzio_terraform_client/grafana_alerts"
 	"github.com/logzio/logzio_terraform_client/grafana_contact_points"
 	"github.com/logzio/prometheus-alerts-migrator/common"
@@ -19,7 +21,6 @@ import (
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/util/workqueue"
 	"k8s.io/klog/v2"
-	"time"
 
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -435,19 +436,36 @@ func (c *Controller) extractValues(cm *corev1.ConfigMap) []rulefmt.RuleNode {
 
 	fallbackNameStub := common.CreateNameStub(cm)
 
-	var toalRules []rulefmt.RuleNode
+	var rules []rulefmt.RuleNode
 
 	for key, value := range cm.Data {
-		// try each encoding
-		// try to extract a rules
-		var rule rulefmt.RuleNode
-		var err error
-		err, rule = c.extractRules(value)
-		if err != nil {
-			errorMsg := fmt.Sprintf("Configmap: %s key: %s Error during extraction.", fallbackNameStub, key)
-			c.configmapEventRecorderFunc(cm, corev1.EventTypeWarning, ErrInvalidKey, errorMsg)
+		if key == "" || value == "" {
+			continue
+		}
+
+		// Try to extract grouped rules first
+		var multiRuleGroups MultiRuleGroups
+		err := yaml.Unmarshal([]byte(value), &multiRuleGroups)
+		if err == nil && len(multiRuleGroups.Values) > 0 {
+			for _, ruleGroup := range multiRuleGroups.Values {
+				for _, group := range ruleGroup.Groups {
+					for _, rule := range group.Rules {
+						rules = append(rules, rule)
+					}
+				}
+			}
+			continue
+		}
+		// If not grouped, try to extract individual rule
+		err, rule := c.extractRules(value)
+		if err != nil {
+			klog.Warningf("Configmap: %s - key: %s Error during extraction.", cm.Name, key)
+			klog.Warningf("%v", err)
+			c.recordEventOnConfigMap(cm, corev1.EventTypeWarning, ErrInvalidKey, fmt.Sprintf("Configmap: %s - key: %s Error during extraction.", cm.Name, key))
+			c.recordEventOnConfigMap(cm, corev1.EventTypeWarning, ErrInvalidKey, fmt.Sprintf("%v", err))
+			c.recordEventOnConfigMap(cm, corev1.EventTypeWarning, ErrInvalidKey, fmt.Sprintf("Configmap: %s - key: %s Rejected, no valid rules.", cm.Name, key))
 			continue
 		}
 		// Add unique name for the alert rule to prevent duplicate rules ([alert_name]-[configmap_name]-[configmap_namespace])
 		rule.Alert.Value = fmt.Sprintf("%s-%s-%s", cm.Name, cm.Namespace, key)
 
@@ -466,13 +484,14 @@ func (c *Controller) extractValues(cm *corev1.ConfigMap) []rulefmt.RuleNode {
 
 			} else {
 				// add to the rulegroups
-				toalRules = append(toalRules, rule)
+				rules = append(rules, rule)
 			}
 		}
 	}
-	klog.Info(fmt.Sprintf("Found %d rules in %s configmap", len(toalRules), cm.Name))
 
-	return toalRules
+	klog.Info(fmt.Sprintf("Found %d rules in %s configmap", len(rules), cm.Name))
+
+	return rules
 }
 
 // extractRules extracts the rules from the configmap key
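The new branch in extractValues unmarshals each ConfigMap value into a MultiRuleGroups wrapper and walks Values → Groups → Rules, falling back to the existing single-rule extraction when the value is not in the grouped format. The wrapper type itself is not part of this diff, so the sketch below is only a rough stand-in: it uses hypothetical local types with the same groups/rules nesting (omitting the extra Values level) and gopkg.in/yaml.v3 (assumed to be the yaml package in use) to show how a grouped value such as the high_latency key in testdata/cm3.yml parses.

package main

import (
	"fmt"
	"log"

	"gopkg.in/yaml.v3"
)

// Stand-in types mirroring the groups -> rules nesting the controller walks.
// The repository's MultiRuleGroups and rulefmt.RuleNode types are not shown in
// this hunk, so these definitions are assumptions for illustration only.
type rule struct {
	Alert       string            `yaml:"alert"`
	Expr        string            `yaml:"expr"`
	For         string            `yaml:"for"`
	Labels      map[string]string `yaml:"labels"`
	Annotations map[string]string `yaml:"annotations"`
}

type ruleGroup struct {
	Name  string `yaml:"name"`
	Rules []rule `yaml:"rules"`
}

type ruleGroups struct {
	Groups []ruleGroup `yaml:"groups"`
}

func main() {
	// A grouped value, shaped like the keys in testdata/cm3.yml.
	value := `groups:
  - name: high_latency
    rules:
      - alert: High_Latency
        expr: histogram_quantile(0.95, sum(rate(otelcol_process_latency_seconds_bucket{app="test-otel-collector"}[5m])) by (le)) > 0.6
        for: 5m
        labels:
          severity: critical`

	var parsed ruleGroups
	err := yaml.Unmarshal([]byte(value), &parsed)
	if err != nil || len(parsed.Groups) == 0 {
		// This is where the controller would fall back to its single-rule extraction.
		log.Fatalf("value is not in the grouped format: %v", err)
	}

	// Mirror the controller's nested walk: collect every rule from every group.
	var collected []rule
	for _, g := range parsed.Groups {
		collected = append(collected, g.Rules...)
	}
	fmt.Printf("extracted %d rule(s); first alert: %s\n", len(collected), collected[0].Alert)
}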
99 changes: 99 additions & 0 deletions testdata/cm3.yml
@@ -0,0 +1,99 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: infrastructure-rules-groups
  namespace: alert-migrator-test
  labels:
    app: prometheus
  annotations:
    prometheus.io/kube-rules: "true"
data:
  high_latency: |
    groups:
      - name: high_latency
        rules:
          - alert: High_Latency
            expr: histogram_quantile(0.95, sum(rate(otelcol_process_latency_seconds_bucket{app="test-otel-collector"}[5m])) by (le)) > 0.6
            for: 5m
            labels:
              team: "sre"
              severity: "critical"
              purpose: "test"
            annotations:
              description: "95th percentile latency is above 600ms for the test OpenTelemetry collector test"
              summary: "High 95th percentile latency observed in test environment"

  memory_usage: |
    groups:
      - name: memory_usage
        rules:
          - alert: High_Memory_Usage
            expr: sum by (instance) (container_memory_usage_bytes{container="otel-collector-test"}) / sum by (instance) (container_spec_memory_limit_bytes{container="otel-collector-test"}) > 0.7
            for: 5m
            labels:
              team: "sre"
              severity: "warning"
              purpose: "test"
            annotations:
              description: "Memory usage for the test OpenTelemetry collector is above 70% of the limit"
              summary: "High memory usage detected for the test OpenTelemetry collector"

  cpu_usage: |
    groups:
      - name: cpu_usage
        rules:
          - alert: High_CPU_Usage
            expr: sum(rate(container_cpu_usage_seconds_total{container="otel-collector-test"}[5m])) by (instance) > 0.9
            for: 5m
            labels:
              team: "sre"
              severity: "warning"
              purpose: "test"
            annotations:
              description: "CPU usage for the test OpenTelemetry collector is above 90%"
              summary: "High CPU usage detected for the test OpenTelemetry collector"

  packet_loss: |
    groups:
      - name: packet_loss
        rules:
          - alert: Packet_Loss
            expr: rate(packet_loss_total{app="test-network"}[5m]) > 0.1
            for: 5m
            labels:
              team: "network"
              severity: "critical"
              purpose: "test"
            annotations:
              description: "Packet loss rate is above 10% on the test network"
              summary: "Significant packet loss detected in test network"

  api_response: |
    groups:
      - name: api_response
        rules:
          - alert: API_Response_Failure
            expr: rate(api_response_failures{endpoint="/test-api"}[5m]) > 0.05
            for: 5m
            labels:
              team: "backend"
              severity: "major"
              purpose: "test"
            annotations:
              description: "API endpoint /test-api is failing more than 5% of the time"
              summary: "High failure rate detected on /test-api endpoint"

  disk_usage: |
    groups:
      - name: disk_usage
        rules:
          - alert: Disk_Usage
            expr: (node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_free_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"} > 0.8
            for: 5m
            labels:
              team: "ops"
              severity: "warning"
              purpose: "test"
            annotations:
              description: "Disk usage for /var/lib/docker is above 80%"
              summary: "High disk usage detected on /var/lib/docker"
