Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

temporarily remove rules with last_over_time function #2111

Merged
merged 1 commit into from
Sep 19, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ items:
kind: GlobalRuleGroup
metadata:
annotations:
alerting.kubesphere.io/initial-configuration: '{"apiVersion":"alerting.kubesphere.io/v2beta1","kind":"GlobalRuleGroup","metadata":{"annotations":{},"labels":{"alerting.kubesphere.io/builtin":"true","alerting.kubesphere.io/enable":"true"},"name":"etcd","namespace":"kubesphere-monitoring-system"},"spec":{"rules":[{"alert":"etcdMembersDown","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }}).","summary":"etcd cluster members are down."},"expr":"max without (endpoint) (\n sum without (instance) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdInsufficientMembers","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).","summary":"etcd cluster has insufficient number of members."},"expr":"sum(up{job=~\".*etcd.*\"} == bool 1) without (instance) < ((count(up{job=~\".*etcd.*\"}) without (instance) + 1) / 2)\n","for":"3m","labels":{},"severity":"critical"},{"alert":"etcdNoLeader","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.","summary":"etcd cluster has no leader."},"expr":"etcd_server_has_leader{job=~\".*etcd.*\"} == 0\n","for":"1m","labels":{},"severity":"critical"},{"alert":"etcdHighNumberOfLeaderChanges","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.","summary":"etcd cluster has high number of leader changes."},"expr":"increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 4\n","for":"5m","labels":{},"severity":"warning"},{"alert":"etcdHighNumberOfFailedGRPCRequests","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.","summary":"etcd cluster has high number of failed grpc requests."},"expr":"100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdHighNumberOfFailedGRPCRequests","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.","summary":"etcd cluster has high number of failed grpc requests."},"expr":"100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n","for":"5m","labels":{},"severity":"critical"},{"alert":"etcdGRPCRequestsSlow","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.","summary":"etcd grpc requests are slow"},"expr":"histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_method!=\"Defragment\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n> 0.15\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdMemberCommunicationSlow","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster member communication is slow."},"expr":"histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.15\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdHighNumberOfFailedProposals","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.","summary":"etcd cluster has high number of proposal failures."},"expr":"rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5\n","for":"15m","labels":{},"severity":"warning"},{"alert":"etcdHighFsyncDurations","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster 99th percentile fsync durations are too high."},"expr":"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.5\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdHighFsyncDurations","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster 99th percentile fsync durations are too high."},"expr":"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 1\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdHighCommitDurations","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster 99th percentile commit durations are too high."},"expr":"histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.25\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdDatabaseQuotaLowSpace","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.","summary":"etcd cluster database is running full."},"expr":"(last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdExcessiveDatabaseGrowth","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.","summary":"etcd cluster database growing very fast."},"expr":"predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdDatabaseHighFragmentationRatio","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.","runbook_url":"https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation","summary":"etcd database size in use is less than 50% of the actual allocated storage."},"expr":"(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5\n","for":"10m","labels":{},"severity":"warning"}]}}'
alerting.kubesphere.io/initial-configuration: '{"apiVersion":"alerting.kubesphere.io/v2beta1","kind":"GlobalRuleGroup","metadata":{"annotations":{},"labels":{"alerting.kubesphere.io/builtin":"true","alerting.kubesphere.io/enable":"true"},"name":"etcd","namespace":"kubesphere-monitoring-system"},"spec":{"rules":[{"alert":"etcdMembersDown","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": members are down ({{ $value }}).","summary":"etcd cluster members are down."},"expr":"max without (endpoint) (\n sum without (instance) (up{job=~\".*etcd.*\"} == bool 0)\nor\n count without (To) (\n sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~\".*etcd.*\"}[120s])) > 0.01\n )\n)\n> 0\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdInsufficientMembers","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": insufficient members ({{ $value }}).","summary":"etcd cluster has insufficient number of members."},"expr":"sum(up{job=~\".*etcd.*\"} == bool 1) without (instance) < ((count(up{job=~\".*etcd.*\"}) without (instance) + 1) / 2)\n","for":"3m","labels":{},"severity":"critical"},{"alert":"etcdNoLeader","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": member {{ $labels.instance }} has no leader.","summary":"etcd cluster has no leader."},"expr":"etcd_server_has_leader{job=~\".*etcd.*\"} == 0\n","for":"1m","labels":{},"severity":"critical"},{"alert":"etcdHighNumberOfLeaderChanges","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.","summary":"etcd cluster has high number of leader changes."},"expr":"increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~\".*etcd.*\"}))[15m:1m]) >= 4\n","for":"5m","labels":{},"severity":"warning"},{"alert":"etcdHighNumberOfFailedGRPCRequests","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.","summary":"etcd cluster has high number of failed grpc requests."},"expr":"100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 1\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdHighNumberOfFailedGRPCRequests","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.","summary":"etcd cluster has high number of failed grpc requests."},"expr":"100 * sum(rate(grpc_server_handled_total{job=~\".*etcd.*\", grpc_code=~\"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded\"}[5m])) without (grpc_type, grpc_code)\n /\nsum(rate(grpc_server_handled_total{job=~\".*etcd.*\"}[5m])) without (grpc_type, grpc_code)\n > 5\n","for":"5m","labels":{},"severity":"critical"},{"alert":"etcdGRPCRequestsSlow","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.","summary":"etcd grpc requests are slow"},"expr":"histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~\".*etcd.*\", grpc_method!=\"Defragment\", grpc_type=\"unary\"}[5m])) without(grpc_type))\n> 0.15\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdMemberCommunicationSlow","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster member communication is slow."},"expr":"histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.15\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdHighNumberOfFailedProposals","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.","summary":"etcd cluster has high number of proposal failures."},"expr":"rate(etcd_server_proposals_failed_total{job=~\".*etcd.*\"}[15m]) > 5\n","for":"15m","labels":{},"severity":"warning"},{"alert":"etcdHighFsyncDurations","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster 99th percentile fsync durations are too high."},"expr":"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.5\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdHighFsyncDurations","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster 99th percentile fsync durations are too high."},"expr":"histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 1\n","for":"10m","labels":{},"severity":"critical"},{"alert":"etcdHighCommitDurations","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.","summary":"etcd cluster 99th percentile commit durations are too high."},"expr":"histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~\".*etcd.*\"}[5m]))\n> 0.25\n","for":"10m","labels":{},"severity":"warning"},{"alert":"etcdExcessiveDatabaseGrowth","annotations":{"description":"etcd cluster \"{{ $labels.job }}\": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.","summary":"etcd cluster database growing very fast."},"expr":"predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes\n","for":"10m","labels":{},"severity":"warning"}]}}'
labels:
alerting.kubesphere.io/builtin: "true"
alerting.kubesphere.io/enable: "true"
Expand Down Expand Up @@ -138,15 +138,6 @@ items:
for: 10m
labels: {}
severity: warning
- alert: etcdDatabaseQuotaLowSpace
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
summary: etcd cluster database is running full.
expr: |
(last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
for: 10m
labels: {}
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
Expand All @@ -156,14 +147,4 @@ items:
for: 10m
labels: {}
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary: etcd database size in use is less than 50% of the actual allocated storage.
expr: |
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
for: 10m
labels: {}
severity: warning
kind: List