-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
131 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
apiVersion: v1 | ||
kind: ConfigMap | ||
metadata: | ||
name: infrastructure-rules-groups | ||
namespace: alert-migrator-test | ||
labels: | ||
app: prometheus | ||
annotations: | ||
prometheus.io/kube-rules: "true" | ||
data: | ||
high_latency: | | ||
groups: | ||
- name: high_latency | ||
rules: | ||
- alert: High_Latency | ||
expr: histogram_quantile(0.95, sum(rate(otelcol_process_latency_seconds_bucket{app="test-otel-collector"}[5m])) by (le)) > 0.6 | ||
for: 5m | ||
labels: | ||
team: "sre" | ||
severity: "critical" | ||
purpose: "test" | ||
annotations: | ||
description: "95th percentile latency is above 600ms for the test OpenTelemetry collector test" | ||
summary: "High 95th percentile latency observed in test environment" | ||
memory_usage: | | ||
groups: | ||
- name: memory_usage | ||
rules: | ||
- alert: High_Memory_Usage | ||
expr: sum by (instance) (container_memory_usage_bytes{container="otel-collector-test"}) / sum by (instance) (container_spec_memory_limit_bytes{container="otel-collector-test"}) > 0.7 | ||
for: 5m | ||
labels: | ||
team: "sre" | ||
severity: "warning" | ||
purpose: "test" | ||
annotations: | ||
description: "Memory usage for the test OpenTelemetry collector is above 70% of the limit" | ||
summary: "High memory usage detected for the test OpenTelemetry collector" | ||
cpu_usage: | | ||
groups: | ||
- name: cpu_usage | ||
rules: | ||
- alert: High_CPU_Usage | ||
expr: sum(rate(container_cpu_usage_seconds_total{container="otel-collector-test"}[5m])) by (instance) > 0.9 | ||
for: 5m | ||
labels: | ||
team: "sre" | ||
severity: "warning" | ||
purpose: "test" | ||
annotations: | ||
description: "CPU usage for the test OpenTelemetry collector is above 90%" | ||
summary: "High CPU usage detected for the test OpenTelemetry collector" | ||
packet_loss: | | ||
groups: | ||
- name: packet_loss | ||
rules: | ||
- alert: Packet_Loss | ||
expr: rate(packet_loss_total{app="test-network"}[5m]) > 0.1 | ||
for: 5m | ||
labels: | ||
team: "network" | ||
severity: "critical" | ||
purpose: "test" | ||
annotations: | ||
description: "Packet loss rate is above 10% on the test network" | ||
summary: "Significant packet loss detected in test network" | ||
api_response: | | ||
groups: | ||
- name: api_response | ||
rules: | ||
- alert: API_Response_Failure | ||
expr: rate(api_response_failures{endpoint="/test-api"}[5m]) > 0.05 | ||
for: 5m | ||
labels: | ||
team: "backend" | ||
severity: "major" | ||
purpose: "test" | ||
annotations: | ||
description: "API endpoint /test-api is failing more than 5% of the time" | ||
summary: "High failure rate detected on /test-api endpoint" | ||
disk_usage: | | ||
groups: | ||
- name: disk_usage | ||
rules: | ||
- alert: Disk_Usage | ||
expr: (node_filesystem_size_bytes{mountpoint="/var/lib/docker"} - node_filesystem_free_bytes{mountpoint="/var/lib/docker"}) / node_filesystem_size_bytes{mountpoint="/var/lib/docker"} > 0.8 | ||
for: 5m | ||
labels: | ||
team: "ops" | ||
severity: "warning" | ||
purpose: "test" | ||
annotations: | ||
description: "Disk usage for /var/lib/docker is above 80%" | ||
summary: "High disk usage detected on /var/lib/docker" |