From b9b94442d30b2811323f48f18a634de46fb8514c Mon Sep 17 00:00:00 2001 From: Benjamin Smith Date: Thu, 10 Feb 2022 10:04:27 -0800 Subject: [PATCH 1/5] allow enabling and disabling of particular monitors --- modules/monitors/main.tf | 2 +- modules/monitors/variables.tf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/monitors/main.tf b/modules/monitors/main.tf index e04f542..f502558 100644 --- a/modules/monitors/main.tf +++ b/modules/monitors/main.tf @@ -6,7 +6,7 @@ locals { # https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor resource "datadog_monitor" "default" { - for_each = local.enabled ? var.datadog_monitors : {} + for_each = local.enabled ? {for k,v in var.datadog_monitors: k=>v if lookup(v, "enabled", true)} : {} name = each.value.name type = each.value.type diff --git a/modules/monitors/variables.tf b/modules/monitors/variables.tf index 2e15a22..a01e79f 100644 --- a/modules/monitors/variables.tf +++ b/modules/monitors/variables.tf @@ -1,6 +1,7 @@ # https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor variable "datadog_monitors" { type = map(object({ + enabled = bool name = string type = string message = string From 8167228f94f6aa9c3bb8fb4695212aeef4029227 Mon Sep 17 00:00:00 2001 From: Benjamin Smith Date: Thu, 10 Feb 2022 10:06:47 -0800 Subject: [PATCH 2/5] update examples --- catalog/monitors/amq.yaml | 5 +++++ catalog/monitors/aurora.yaml | 1 + catalog/monitors/ec2.yaml | 1 + catalog/monitors/host.yaml | 4 ++++ catalog/monitors/k8s.yaml | 21 +++++++++++++++++++++ catalog/monitors/rabbitmq.yaml | 2 ++ catalog/monitors/rds.yaml | 5 +++++ catalog/monitors/redshift.yaml | 9 +++++++++ 8 files changed, 48 insertions(+) diff --git a/catalog/monitors/amq.yaml b/catalog/monitors/amq.yaml index 3447dec..2d0dd24 100644 --- a/catalog/monitors/amq.yaml +++ b/catalog/monitors/amq.yaml @@ -3,6 +3,7 @@ amq-cpu-utilization: name: "(AMQ) CPU Utilization above 90%" + enabled: true type: metric alert query: | avg(last_15m):avg:aws.amazonmq.cpu_utilization{*} by {broker} > 90 @@ -45,6 +46,7 @@ amq-cpu-utilization: amq-heap-usage: name: "(AMQ) JVM heap usage above 95%" + enabled: true type: metric alert query: | avg(last_15m):avg:aws.amazonmq.heap_usage{*} by {broker} > 95 @@ -87,6 +89,7 @@ amq-heap-usage: amq-network-in: name: "(AMQ) Anomaly of a large variance in network-in bytes" + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.amazonmq.network_in{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -128,6 +131,7 @@ amq-network-in: amq-network-out: name: "(AMQ) Anomaly of a large variance in network-out bytes" + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.amazonmq.network_out{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -169,6 +173,7 @@ amq-network-out: amq-current-connections-count: name: "(AMQ) Anomaly of a large variance in broker connections" + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.amazonmq.current_connections_count{*}.as_count(), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1 diff --git a/catalog/monitors/aurora.yaml b/catalog/monitors/aurora.yaml index bc88e29..290183a 100644 --- a/catalog/monitors/aurora.yaml +++ b/catalog/monitors/aurora.yaml @@ -3,6 +3,7 @@ 
aurora-replica-lag: name: "(RDS) Aurora Replica Lag Detected" + enabled: true type: metric alert query: | min(last_15m):min:aws.rds.aurora_replica_lag{*} by {dbinstanceidentifier} > 1000 diff --git a/catalog/monitors/ec2.yaml b/catalog/monitors/ec2.yaml index b4c842b..1419c0a 100644 --- a/catalog/monitors/ec2.yaml +++ b/catalog/monitors/ec2.yaml @@ -3,6 +3,7 @@ ec2-failed-status-check: name: "(EC2) Failed Status Check" + enabled: true type: metric alert query: | avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0 diff --git a/catalog/monitors/host.yaml b/catalog/monitors/host.yaml index 9f2bffd..0170425 100644 --- a/catalog/monitors/host.yaml +++ b/catalog/monitors/host.yaml @@ -3,6 +3,7 @@ host-io-wait-times: name: "(Host) I/O Wait Times" + enabled: true type: metric alert query: "avg(last_10m):avg:system.cpu.iowait{*} by {host} > 50" message: |- @@ -39,6 +40,7 @@ host-io-wait-times: host-disk-use: name: "(Host) Host Disk Usage" + enabled: true type: metric alert query: "avg(last_30m):(avg:system.disk.total{*} by {host} - avg:system.disk.free{*} by {host}) / avg:system.disk.total{*} by {host} * 100 > 90" message: |- @@ -75,6 +77,7 @@ host-disk-use: host-high-mem-use: name: "(Host) Memory Utilization" + enabled: true type: query alert query: "avg(last_15m):avg:system.mem.pct_usable{*} by {host} < 0.1" message: |- @@ -111,6 +114,7 @@ host-high-mem-use: host-high-load-avg: name: "(Host) High System Load Average" + enabled: true type: metric alert query: "avg(last_30m):avg:system.load.norm.5{*} by {host} > 2" message: |- diff --git a/catalog/monitors/k8s.yaml b/catalog/monitors/k8s.yaml index d6c2313..a0295b1 100644 --- a/catalog/monitors/k8s.yaml +++ b/catalog/monitors/k8s.yaml @@ -3,6 +3,7 @@ k8s-deployment-replica-pod-down: name: "(k8s) Deployment Replica Pod is down" + enabled: true type: query alert query: | avg(last_15m):avg:kubernetes_state.deployment.replicas_desired{*} by {cluster_name,deployment} - avg:kubernetes_state.deployment.replicas_ready{*} by {cluster_name,deployment} >= 2 @@ -35,6 +36,7 @@ k8s-deployment-replica-pod-down: k8s-pod-restarting: name: "(k8s) Pods are restarting multiple times" + enabled: true type: query alert query: | change(sum(last_5m),last_5m):exclude_null(avg:kubernetes.containers.restarts{*} by {cluster_name,kube_namespace,pod_name}) > 5 @@ -68,6 +70,7 @@ k8s-pod-restarting: k8s-statefulset-replica-down: name: "(k8s) StatefulSet Replica Pod is down" + enabled: true type: query alert query: | max(last_15m):sum:kubernetes_state.statefulset.replicas_desired{*} by {cluster_name,kube_namespace,statefulset} - sum:kubernetes_state.statefulset.replicas_ready{*} by {cluster_name,kube_namespace,statefulset} >= 2 @@ -101,6 +104,7 @@ k8s-statefulset-replica-down: k8s-daemonset-pod-down: name: "(k8s) DaemonSet Pod is down" + enabled: true type: query alert query: | max(last_15m):sum:kubernetes_state.daemonset.desired{*} by {cluster_name,kube_namespace,daemonset} - sum:kubernetes_state.daemonset.ready{*} by {cluster_name,kube_namespace,daemonset} >= 1 @@ -133,6 +137,7 @@ k8s-daemonset-pod-down: k8s-crashloopBackOff: name: "(k8s) CrashloopBackOff detected" + enabled: true type: query alert query: | max(last_10m):max:kubernetes_state.container.status_report.count.waiting{reason:crashloopbackoff} by {cluster_name,kube_namespace,pod_name} >= 1 @@ -165,6 +170,7 @@ k8s-crashloopBackOff: k8s-multiple-pods-failing: name: "(k8s) Multiple Pods are failing" + enabled: true type: query alert query: | 
change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {cluster_name,kube_namespace} > 10 @@ -198,6 +204,7 @@ k8s-multiple-pods-failing: k8s-unavailable-deployment-replica: name: "(k8s) Unavailable Deployment Replica(s) detected" + enabled: true type: metric alert query: | max(last_10m):max:kubernetes_state.deployment.replicas_unavailable{*} by {cluster_name,kube_namespace} > 0 @@ -235,6 +242,7 @@ k8s-unavailable-deployment-replica: k8s-unavailable-statefulset-replica: name: "(k8s) Unavailable Statefulset Replica(s) detected" + enabled: true type: metric alert query: | max(last_10m):max:kubernetes_state.statefulset.replicas_unavailable{*} by {cluster_name,kube_namespace} > 0 @@ -272,6 +280,7 @@ k8s-unavailable-statefulset-replica: k8s-node-status-unschedulable: name: "(k8s) Detected Unschedulable Node(s)" + enabled: true type: query alert query: | max(last_15m):sum:kubernetes_state.node.status{status:schedulable} by {cluster_name} * 100 / sum:kubernetes_state.node.status{*} by {cluster_name} < 80 @@ -309,6 +318,7 @@ k8s-node-status-unschedulable: k8s-imagepullbackoff: name: "(k8s) ImagePullBackOff detected" + enabled: true type: "query alert" query: | max(last_10m):max:kubernetes_state.container.status_report.count.waiting{reason:imagepullbackoff} by {kube_cluster_name,kube_namespace,pod_name} >= 1 @@ -346,6 +356,7 @@ k8s-imagepullbackoff: k8s-high-cpu-usage: name: "(k8s) High CPU Usage Detected" + enabled: true type: metric alert query: | avg(last_10m):avg:system.cpu.system{*} by {host} > 90 @@ -383,6 +394,7 @@ k8s-high-cpu-usage: k8s-high-disk-usage: name: "(k8s) High Disk Usage Detected" + enabled: true type: metric alert query: | min(last_5m):min:system.disk.used{*} by {host,cluster_name} / avg:system.disk.total{*} by {host,cluster_name} * 100 > 90 @@ -420,6 +432,7 @@ k8s-high-disk-usage: k8s-high-memory-usage: name: "(k8s) High Memory Usage Detected" + enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.memory.usage_pct{*} by {cluster_name} > 90 @@ -462,6 +475,7 @@ k8s-high-memory-usage: k8s-high-filesystem-usage: name: "(k8s) High Filesystem Usage Detected" + enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.filesystem.usage_pct{*} by {cluster_name} > 90 @@ -504,6 +518,7 @@ k8s-high-filesystem-usage: k8s-network-tx-errors: name: "(k8s) High Network TX (send) Errors" + enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.network.tx_errors{*} by {cluster_name} > 100 @@ -546,6 +561,7 @@ k8s-network-tx-errors: k8s-network-rx-errors: name: "(k8s) High Network RX (receive) Errors" + enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.network.rx_errors{*} by {cluster_name} > 100 @@ -588,6 +604,7 @@ k8s-network-rx-errors: k8s-node-not-ready: name: "(k8s) Node Not Ready" + enabled: true type: service check query: | "kubernetes_state.node.ready".by('host').last(5).count_by_status() @@ -625,6 +642,7 @@ k8s-node-not-ready: k8s-kube-api-down: name: "(k8s) KubeAPI Down" + enabled: true type: service check query: | "kube_apiserver_controlplane.up".by('host').last(5).count_by_status() @@ -662,6 +680,7 @@ k8s-kube-api-down: k8s-increased-pod-crash: name: "(k8s) Increased Pod Crashes" + enabled: true type: query alert query: | avg(last_5m):avg:kubernetes_state.container.restarts{*} by {cluster_name,kube_namespace,pod} - hour_before(avg:kubernetes_state.container.restarts{*} by {cluster_name,kube_namespace,pod}) > 3 @@ -699,6 +718,7 @@ k8s-increased-pod-crash: k8s-hpa-errors: name: "(k8s) HPA 
Errors" + enabled: true type: event alert query: | events('sources:kubernetes priority:all \"unable to fetch metrics from resource metrics API:\"').by('hpa').rollup('count').last('1h') > 200 @@ -736,6 +756,7 @@ k8s-hpa-errors: k8s-pending-pods: name: "(k8s) Pending Pods" + enabled: true type: metric alert query: | min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running} by {cluster_name} - sum:kubernetes_state.pod.status_phase{phase:running} by {cluster_name} + sum:kubernetes_state.pod.status_phase{phase:pending} by {cluster_name}.fill(zero) >= 1 diff --git a/catalog/monitors/rabbitmq.yaml b/catalog/monitors/rabbitmq.yaml index 9cf03bb..7e41b08 100644 --- a/catalog/monitors/rabbitmq.yaml +++ b/catalog/monitors/rabbitmq.yaml @@ -1,5 +1,6 @@ rabbitmq-messages-unacknowledged-rate-too-high: name: "[RabbitMQ] - Messages unacknowledged rate is higher than usual on: {{host.name}}" + enabled: true type: "query alert" query: | avg(last_4h):anomalies(avg:rabbitmq.queue.messages_unacknowledged.rate{*} by {rabbitmq_queue,host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -41,6 +42,7 @@ rabbitmq-messages-unacknowledged-rate-too-high: rabbitmq-disk-usage-too-high: name: "[RabbitMQ] Level of disk usage is too high for host: {{host.name}}" + enabled: true type: "query alert" query: | avg(last_5m):avg:rabbitmq.node.mem_used{*} by {host} / avg:system.mem.total{*} by {host} * 100 > 35 diff --git a/catalog/monitors/rds.yaml b/catalog/monitors/rds.yaml index 1b811ea..a4c11bc 100644 --- a/catalog/monitors/rds.yaml +++ b/catalog/monitors/rds.yaml @@ -3,6 +3,7 @@ rds-cpuutilization: name: "(RDS) CPU Utilization above 90%" + enabled: true type: metric alert query: | avg(last_15m):avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier} > 90 @@ -45,6 +46,7 @@ rds-cpuutilization: rds-disk-queue-depth: name: "(RDS) Disk queue depth above 64" + enabled: true type: metric alert query: | avg(last_15m):avg:aws.rds.disk_queue_depth{*} by {dbinstanceidentifier} > 64 @@ -87,6 +89,7 @@ rds-disk-queue-depth: rds-freeable-memory: name: "(RDS) Freeable memory below 256 MB" + enabled: true type: metric alert query: | avg(last_5m):avg:aws.rds.freeable_memory{*} < 256000000 @@ -129,6 +132,7 @@ rds-freeable-memory: rds-swap-usage: name: "(RDS) Swap usage above 256 MB" + enabled: true type: metric alert query: | avg(last_15m):avg:aws.rds.swap_usage{*} by {dbinstanceidentifier} > 256000000 @@ -171,6 +175,7 @@ rds-swap-usage: rds-database-connections: name: "(RDS) Anomaly of a large variance in RDS connection count" + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.rds.database_connections{*}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1 diff --git a/catalog/monitors/redshift.yaml b/catalog/monitors/redshift.yaml index 712e567..addd75c 100644 --- a/catalog/monitors/redshift.yaml +++ b/catalog/monitors/redshift.yaml @@ -3,6 +3,7 @@ redshift-health-status: name: (Redshift) Health Status + enabled: true type: metric alert query: | min(last_1h):min:aws.redshift.health_status{*} by {clusteridentifier} <= 0 @@ -42,6 +43,7 @@ redshift-health-status: redshift-database-connections: name: (Redshift) Anomaly of a large variance in Redshift connection count + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.database_connections{*} by {clusteridentifier}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, 
count_default_zero='true') >= 1 @@ -83,6 +85,7 @@ redshift-database-connections: redshift-cpuutilization: name: (Redshift) CPU Utilization above 90% + enabled: true type: metric alert query: | avg(last_15m):avg:aws.redshift.cpuutilization{*} by {clusteridentifier} > 90 @@ -125,6 +128,7 @@ redshift-cpuutilization: redshift-write-latency: name: (Redshift) Write latency/cluster + enabled: true type: metric alert query: | avg(last_15m):avg:aws.redshift.write_latency{*} by {clusteridentifier} > 15 @@ -167,6 +171,7 @@ redshift-write-latency: redshift-disk-space-used: name: (Redshift) Percent disk space used/cluster + enabled: true type: metric alert query: | avg(last_15m):avg:aws.redshift.percentage_disk_space_used{*} by {clusteridentifier} > 85 @@ -209,6 +214,7 @@ redshift-disk-space-used: redshift-network-receive: name: (Redshift) Network throughput - received + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.network_receive_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -250,6 +256,7 @@ redshift-network-receive: redshift-network-transmit: name: (Redshift) Network throughput - sent + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.network_transmit_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -291,6 +298,7 @@ redshift-network-transmit: redshift-read-throughput: name: (Redshift) Read throughput/cluster + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.read_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -332,6 +340,7 @@ redshift-read-throughput: redshift-write-throughput: name: (Redshift) Write throughput/cluster + enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.write_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 From 5945c44d89b8de0df24f9b1421f58461aaac156c Mon Sep 17 00:00:00 2001 From: cloudpossebot <11232728+cloudpossebot@users.noreply.github.com> Date: Thu, 10 Feb 2022 18:08:40 +0000 Subject: [PATCH 3/5] Auto Format --- modules/monitors/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/monitors/main.tf b/modules/monitors/main.tf index f502558..e8eb77b 100644 --- a/modules/monitors/main.tf +++ b/modules/monitors/main.tf @@ -6,7 +6,7 @@ locals { # https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor resource "datadog_monitor" "default" { - for_each = local.enabled ? {for k,v in var.datadog_monitors: k=>v if lookup(v, "enabled", true)} : {} + for_each = local.enabled ? 
{ for k, v in var.datadog_monitors : k => v if lookup(v, "enabled", true) } : {} name = each.value.name type = each.value.type From fcf53c9ce4749e14fca7c2d0b11dcb23337991ec Mon Sep 17 00:00:00 2001 From: Benjamin Smith Date: Thu, 10 Feb 2022 11:14:16 -0800 Subject: [PATCH 4/5] update to use any and revert changes to monitors --- catalog/monitors/amq.yaml | 5 --- catalog/monitors/aurora.yaml | 1 - catalog/monitors/ec2.yaml | 1 - catalog/monitors/host.yaml | 4 --- catalog/monitors/k8s.yaml | 21 ------------ catalog/monitors/rabbitmq.yaml | 2 -- catalog/monitors/rds.yaml | 5 --- catalog/monitors/redshift.yaml | 9 ------ modules/monitors/variables.tf | 58 +++++++++++++++++----------------- 9 files changed, 29 insertions(+), 77 deletions(-) diff --git a/catalog/monitors/amq.yaml b/catalog/monitors/amq.yaml index 2d0dd24..3447dec 100644 --- a/catalog/monitors/amq.yaml +++ b/catalog/monitors/amq.yaml @@ -3,7 +3,6 @@ amq-cpu-utilization: name: "(AMQ) CPU Utilization above 90%" - enabled: true type: metric alert query: | avg(last_15m):avg:aws.amazonmq.cpu_utilization{*} by {broker} > 90 @@ -46,7 +45,6 @@ amq-cpu-utilization: amq-heap-usage: name: "(AMQ) JVM heap usage above 95%" - enabled: true type: metric alert query: | avg(last_15m):avg:aws.amazonmq.heap_usage{*} by {broker} > 95 @@ -89,7 +87,6 @@ amq-heap-usage: amq-network-in: name: "(AMQ) Anomaly of a large variance in network-in bytes" - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.amazonmq.network_in{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -131,7 +128,6 @@ amq-network-in: amq-network-out: name: "(AMQ) Anomaly of a large variance in network-out bytes" - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.amazonmq.network_out{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -173,7 +169,6 @@ amq-network-out: amq-current-connections-count: name: "(AMQ) Anomaly of a large variance in broker connections" - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.amazonmq.current_connections_count{*}.as_count(), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1 diff --git a/catalog/monitors/aurora.yaml b/catalog/monitors/aurora.yaml index 290183a..bc88e29 100644 --- a/catalog/monitors/aurora.yaml +++ b/catalog/monitors/aurora.yaml @@ -3,7 +3,6 @@ aurora-replica-lag: name: "(RDS) Aurora Replica Lag Detected" - enabled: true type: metric alert query: | min(last_15m):min:aws.rds.aurora_replica_lag{*} by {dbinstanceidentifier} > 1000 diff --git a/catalog/monitors/ec2.yaml b/catalog/monitors/ec2.yaml index 1419c0a..b4c842b 100644 --- a/catalog/monitors/ec2.yaml +++ b/catalog/monitors/ec2.yaml @@ -3,7 +3,6 @@ ec2-failed-status-check: name: "(EC2) Failed Status Check" - enabled: true type: metric alert query: | avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0 diff --git a/catalog/monitors/host.yaml b/catalog/monitors/host.yaml index 0170425..9f2bffd 100644 --- a/catalog/monitors/host.yaml +++ b/catalog/monitors/host.yaml @@ -3,7 +3,6 @@ host-io-wait-times: name: "(Host) I/O Wait Times" - enabled: true type: metric alert query: "avg(last_10m):avg:system.cpu.iowait{*} by {host} > 50" message: |- @@ -40,7 +39,6 @@ host-io-wait-times: host-disk-use: name: "(Host) Host Disk Usage" - enabled: true type: metric alert query: 
"avg(last_30m):(avg:system.disk.total{*} by {host} - avg:system.disk.free{*} by {host}) / avg:system.disk.total{*} by {host} * 100 > 90" message: |- @@ -77,7 +75,6 @@ host-disk-use: host-high-mem-use: name: "(Host) Memory Utilization" - enabled: true type: query alert query: "avg(last_15m):avg:system.mem.pct_usable{*} by {host} < 0.1" message: |- @@ -114,7 +111,6 @@ host-high-mem-use: host-high-load-avg: name: "(Host) High System Load Average" - enabled: true type: metric alert query: "avg(last_30m):avg:system.load.norm.5{*} by {host} > 2" message: |- diff --git a/catalog/monitors/k8s.yaml b/catalog/monitors/k8s.yaml index a0295b1..d6c2313 100644 --- a/catalog/monitors/k8s.yaml +++ b/catalog/monitors/k8s.yaml @@ -3,7 +3,6 @@ k8s-deployment-replica-pod-down: name: "(k8s) Deployment Replica Pod is down" - enabled: true type: query alert query: | avg(last_15m):avg:kubernetes_state.deployment.replicas_desired{*} by {cluster_name,deployment} - avg:kubernetes_state.deployment.replicas_ready{*} by {cluster_name,deployment} >= 2 @@ -36,7 +35,6 @@ k8s-deployment-replica-pod-down: k8s-pod-restarting: name: "(k8s) Pods are restarting multiple times" - enabled: true type: query alert query: | change(sum(last_5m),last_5m):exclude_null(avg:kubernetes.containers.restarts{*} by {cluster_name,kube_namespace,pod_name}) > 5 @@ -70,7 +68,6 @@ k8s-pod-restarting: k8s-statefulset-replica-down: name: "(k8s) StatefulSet Replica Pod is down" - enabled: true type: query alert query: | max(last_15m):sum:kubernetes_state.statefulset.replicas_desired{*} by {cluster_name,kube_namespace,statefulset} - sum:kubernetes_state.statefulset.replicas_ready{*} by {cluster_name,kube_namespace,statefulset} >= 2 @@ -104,7 +101,6 @@ k8s-statefulset-replica-down: k8s-daemonset-pod-down: name: "(k8s) DaemonSet Pod is down" - enabled: true type: query alert query: | max(last_15m):sum:kubernetes_state.daemonset.desired{*} by {cluster_name,kube_namespace,daemonset} - sum:kubernetes_state.daemonset.ready{*} by {cluster_name,kube_namespace,daemonset} >= 1 @@ -137,7 +133,6 @@ k8s-daemonset-pod-down: k8s-crashloopBackOff: name: "(k8s) CrashloopBackOff detected" - enabled: true type: query alert query: | max(last_10m):max:kubernetes_state.container.status_report.count.waiting{reason:crashloopbackoff} by {cluster_name,kube_namespace,pod_name} >= 1 @@ -170,7 +165,6 @@ k8s-crashloopBackOff: k8s-multiple-pods-failing: name: "(k8s) Multiple Pods are failing" - enabled: true type: query alert query: | change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {cluster_name,kube_namespace} > 10 @@ -204,7 +198,6 @@ k8s-multiple-pods-failing: k8s-unavailable-deployment-replica: name: "(k8s) Unavailable Deployment Replica(s) detected" - enabled: true type: metric alert query: | max(last_10m):max:kubernetes_state.deployment.replicas_unavailable{*} by {cluster_name,kube_namespace} > 0 @@ -242,7 +235,6 @@ k8s-unavailable-deployment-replica: k8s-unavailable-statefulset-replica: name: "(k8s) Unavailable Statefulset Replica(s) detected" - enabled: true type: metric alert query: | max(last_10m):max:kubernetes_state.statefulset.replicas_unavailable{*} by {cluster_name,kube_namespace} > 0 @@ -280,7 +272,6 @@ k8s-unavailable-statefulset-replica: k8s-node-status-unschedulable: name: "(k8s) Detected Unschedulable Node(s)" - enabled: true type: query alert query: | max(last_15m):sum:kubernetes_state.node.status{status:schedulable} by {cluster_name} * 100 / sum:kubernetes_state.node.status{*} by {cluster_name} < 80 @@ -318,7 +309,6 @@ 
k8s-node-status-unschedulable: k8s-imagepullbackoff: name: "(k8s) ImagePullBackOff detected" - enabled: true type: "query alert" query: | max(last_10m):max:kubernetes_state.container.status_report.count.waiting{reason:imagepullbackoff} by {kube_cluster_name,kube_namespace,pod_name} >= 1 @@ -356,7 +346,6 @@ k8s-imagepullbackoff: k8s-high-cpu-usage: name: "(k8s) High CPU Usage Detected" - enabled: true type: metric alert query: | avg(last_10m):avg:system.cpu.system{*} by {host} > 90 @@ -394,7 +383,6 @@ k8s-high-cpu-usage: k8s-high-disk-usage: name: "(k8s) High Disk Usage Detected" - enabled: true type: metric alert query: | min(last_5m):min:system.disk.used{*} by {host,cluster_name} / avg:system.disk.total{*} by {host,cluster_name} * 100 > 90 @@ -432,7 +420,6 @@ k8s-high-disk-usage: k8s-high-memory-usage: name: "(k8s) High Memory Usage Detected" - enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.memory.usage_pct{*} by {cluster_name} > 90 @@ -475,7 +462,6 @@ k8s-high-memory-usage: k8s-high-filesystem-usage: name: "(k8s) High Filesystem Usage Detected" - enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.filesystem.usage_pct{*} by {cluster_name} > 90 @@ -518,7 +504,6 @@ k8s-high-filesystem-usage: k8s-network-tx-errors: name: "(k8s) High Network TX (send) Errors" - enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.network.tx_errors{*} by {cluster_name} > 100 @@ -561,7 +546,6 @@ k8s-network-tx-errors: k8s-network-rx-errors: name: "(k8s) High Network RX (receive) Errors" - enabled: true type: metric alert query: | avg(last_10m):avg:kubernetes.network.rx_errors{*} by {cluster_name} > 100 @@ -604,7 +588,6 @@ k8s-network-rx-errors: k8s-node-not-ready: name: "(k8s) Node Not Ready" - enabled: true type: service check query: | "kubernetes_state.node.ready".by('host').last(5).count_by_status() @@ -642,7 +625,6 @@ k8s-node-not-ready: k8s-kube-api-down: name: "(k8s) KubeAPI Down" - enabled: true type: service check query: | "kube_apiserver_controlplane.up".by('host').last(5).count_by_status() @@ -680,7 +662,6 @@ k8s-kube-api-down: k8s-increased-pod-crash: name: "(k8s) Increased Pod Crashes" - enabled: true type: query alert query: | avg(last_5m):avg:kubernetes_state.container.restarts{*} by {cluster_name,kube_namespace,pod} - hour_before(avg:kubernetes_state.container.restarts{*} by {cluster_name,kube_namespace,pod}) > 3 @@ -718,7 +699,6 @@ k8s-increased-pod-crash: k8s-hpa-errors: name: "(k8s) HPA Errors" - enabled: true type: event alert query: | events('sources:kubernetes priority:all \"unable to fetch metrics from resource metrics API:\"').by('hpa').rollup('count').last('1h') > 200 @@ -756,7 +736,6 @@ k8s-hpa-errors: k8s-pending-pods: name: "(k8s) Pending Pods" - enabled: true type: metric alert query: | min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running} by {cluster_name} - sum:kubernetes_state.pod.status_phase{phase:running} by {cluster_name} + sum:kubernetes_state.pod.status_phase{phase:pending} by {cluster_name}.fill(zero) >= 1 diff --git a/catalog/monitors/rabbitmq.yaml b/catalog/monitors/rabbitmq.yaml index 7e41b08..9cf03bb 100644 --- a/catalog/monitors/rabbitmq.yaml +++ b/catalog/monitors/rabbitmq.yaml @@ -1,6 +1,5 @@ rabbitmq-messages-unacknowledged-rate-too-high: name: "[RabbitMQ] - Messages unacknowledged rate is higher than usual on: {{host.name}}" - enabled: true type: "query alert" query: | avg(last_4h):anomalies(avg:rabbitmq.queue.messages_unacknowledged.rate{*} by {rabbitmq_queue,host}, 'agile', 2, 
direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -42,7 +41,6 @@ rabbitmq-messages-unacknowledged-rate-too-high: rabbitmq-disk-usage-too-high: name: "[RabbitMQ] Level of disk usage is too high for host: {{host.name}}" - enabled: true type: "query alert" query: | avg(last_5m):avg:rabbitmq.node.mem_used{*} by {host} / avg:system.mem.total{*} by {host} * 100 > 35 diff --git a/catalog/monitors/rds.yaml b/catalog/monitors/rds.yaml index a4c11bc..1b811ea 100644 --- a/catalog/monitors/rds.yaml +++ b/catalog/monitors/rds.yaml @@ -3,7 +3,6 @@ rds-cpuutilization: name: "(RDS) CPU Utilization above 90%" - enabled: true type: metric alert query: | avg(last_15m):avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier} > 90 @@ -46,7 +45,6 @@ rds-cpuutilization: rds-disk-queue-depth: name: "(RDS) Disk queue depth above 64" - enabled: true type: metric alert query: | avg(last_15m):avg:aws.rds.disk_queue_depth{*} by {dbinstanceidentifier} > 64 @@ -89,7 +87,6 @@ rds-disk-queue-depth: rds-freeable-memory: name: "(RDS) Freeable memory below 256 MB" - enabled: true type: metric alert query: | avg(last_5m):avg:aws.rds.freeable_memory{*} < 256000000 @@ -132,7 +129,6 @@ rds-freeable-memory: rds-swap-usage: name: "(RDS) Swap usage above 256 MB" - enabled: true type: metric alert query: | avg(last_15m):avg:aws.rds.swap_usage{*} by {dbinstanceidentifier} > 256000000 @@ -175,7 +171,6 @@ rds-swap-usage: rds-database-connections: name: "(RDS) Anomaly of a large variance in RDS connection count" - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.rds.database_connections{*}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1 diff --git a/catalog/monitors/redshift.yaml b/catalog/monitors/redshift.yaml index addd75c..712e567 100644 --- a/catalog/monitors/redshift.yaml +++ b/catalog/monitors/redshift.yaml @@ -3,7 +3,6 @@ redshift-health-status: name: (Redshift) Health Status - enabled: true type: metric alert query: | min(last_1h):min:aws.redshift.health_status{*} by {clusteridentifier} <= 0 @@ -43,7 +42,6 @@ redshift-health-status: redshift-database-connections: name: (Redshift) Anomaly of a large variance in Redshift connection count - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.database_connections{*} by {clusteridentifier}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1 @@ -85,7 +83,6 @@ redshift-database-connections: redshift-cpuutilization: name: (Redshift) CPU Utilization above 90% - enabled: true type: metric alert query: | avg(last_15m):avg:aws.redshift.cpuutilization{*} by {clusteridentifier} > 90 @@ -128,7 +125,6 @@ redshift-cpuutilization: redshift-write-latency: name: (Redshift) Write latency/cluster - enabled: true type: metric alert query: | avg(last_15m):avg:aws.redshift.write_latency{*} by {clusteridentifier} > 15 @@ -171,7 +167,6 @@ redshift-write-latency: redshift-disk-space-used: name: (Redshift) Percent disk space used/cluster - enabled: true type: metric alert query: | avg(last_15m):avg:aws.redshift.percentage_disk_space_used{*} by {clusteridentifier} > 85 @@ -214,7 +209,6 @@ redshift-disk-space-used: redshift-network-receive: name: (Redshift) Network throughput - received - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.network_receive_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', 
interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -256,7 +250,6 @@ redshift-network-receive: redshift-network-transmit: name: (Redshift) Network throughput - sent - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.network_transmit_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -298,7 +291,6 @@ redshift-network-transmit: redshift-read-throughput: name: (Redshift) Read throughput/cluster - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.read_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 @@ -340,7 +332,6 @@ redshift-read-throughput: redshift-write-throughput: name: (Redshift) Write throughput/cluster - enabled: true type: metric alert query: | avg(last_4h):anomalies(avg:aws.redshift.write_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1 diff --git a/modules/monitors/variables.tf b/modules/monitors/variables.tf index a01e79f..01eb25f 100644 --- a/modules/monitors/variables.tf +++ b/modules/monitors/variables.tf @@ -1,37 +1,37 @@ # https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor variable "datadog_monitors" { - type = map(object({ - enabled = bool - name = string - type = string - message = string - escalation_message = string - query = string - tags = map(string) - notify_no_data = bool - new_group_delay = number - evaluation_delay = number - no_data_timeframe = number - renotify_interval = number - renotify_occurrences = number - renotify_statuses = set(string) - notify_audit = bool - timeout_h = number - enable_logs_sample = bool - include_tags = bool - require_full_window = bool - locked = bool - force_delete = bool - threshold_windows = map(any) - thresholds = map(any) - priority = number - groupby_simple_monitor = bool - validate = bool + type = any +# enabled = bool +# name = string +# type = string +# message = string +# escalation_message = string +# query = string +# tags = map(string) +# notify_no_data = bool +# new_group_delay = number +# evaluation_delay = number +# no_data_timeframe = number +# renotify_interval = number +# renotify_occurrences = number +# renotify_statuses = set(string) +# notify_audit = bool +# timeout_h = number +# enable_logs_sample = bool +# include_tags = bool +# require_full_window = bool +# locked = bool +# force_delete = bool +# threshold_windows = map(any) +# thresholds = map(any) +# priority = number +# groupby_simple_monitor = bool +# validate = bool # TODO: deprecate in favor of new_group_delay once the options are fully clarified # See https://github.com/DataDog/terraform-provider-datadog/issues/1292 - new_host_delay = number - })) +# new_host_delay = number +# })) description = "Map of Datadog monitor configurations. 
See catalog for examples" } From bc32a2b9a9e82fad83b082a7442a3cac74429ada Mon Sep 17 00:00:00 2001 From: cloudpossebot <11232728+cloudpossebot@users.noreply.github.com> Date: Thu, 10 Feb 2022 19:15:16 +0000 Subject: [PATCH 5/5] Auto Format --- modules/monitors/variables.tf | 60 +++++++++++++++++------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/modules/monitors/variables.tf b/modules/monitors/variables.tf index 01eb25f..aca4740 100644 --- a/modules/monitors/variables.tf +++ b/modules/monitors/variables.tf @@ -1,37 +1,37 @@ # https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor variable "datadog_monitors" { type = any -# enabled = bool -# name = string -# type = string -# message = string -# escalation_message = string -# query = string -# tags = map(string) -# notify_no_data = bool -# new_group_delay = number -# evaluation_delay = number -# no_data_timeframe = number -# renotify_interval = number -# renotify_occurrences = number -# renotify_statuses = set(string) -# notify_audit = bool -# timeout_h = number -# enable_logs_sample = bool -# include_tags = bool -# require_full_window = bool -# locked = bool -# force_delete = bool -# threshold_windows = map(any) -# thresholds = map(any) -# priority = number -# groupby_simple_monitor = bool -# validate = bool + # enabled = bool + # name = string + # type = string + # message = string + # escalation_message = string + # query = string + # tags = map(string) + # notify_no_data = bool + # new_group_delay = number + # evaluation_delay = number + # no_data_timeframe = number + # renotify_interval = number + # renotify_occurrences = number + # renotify_statuses = set(string) + # notify_audit = bool + # timeout_h = number + # enable_logs_sample = bool + # include_tags = bool + # require_full_window = bool + # locked = bool + # force_delete = bool + # threshold_windows = map(any) + # thresholds = map(any) + # priority = number + # groupby_simple_monitor = bool + # validate = bool - # TODO: deprecate in favor of new_group_delay once the options are fully clarified - # See https://github.com/DataDog/terraform-provider-datadog/issues/1292 -# new_host_delay = number -# })) + # TODO: deprecate in favor of new_group_delay once the options are fully clarified + # See https://github.com/DataDog/terraform-provider-datadog/issues/1292 + # new_host_delay = number + # })) description = "Map of Datadog monitor configurations. See catalog for examples" }
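
Usage sketch (illustrative, not taken from the patches): with the enabled flag from PATCH 1 and the relaxed variable type from PATCH 4, a root module can switch off a single catalog monitor without deleting its definition. The root-module wiring and relative source path below are assumptions for illustration; "rds-swap-usage" is a real key from catalog/monitors/rds.yaml, and any other inputs the module requires are omitted.

    locals {
      # Load the monitor definitions shipped in this repo's catalog.
      rds_monitors = yamldecode(file("${path.module}/catalog/monitors/rds.yaml"))
    }

    module "monitors" {
      source = "./modules/monitors" # illustrative path; other required inputs omitted

      # Keep the catalog definition of rds-swap-usage, but skip creating it.
      datadog_monitors = merge(local.rds_monitors, {
        "rds-swap-usage" = merge(local.rds_monitors["rds-swap-usage"], { enabled = false })
      })
    }

Monitors whose YAML omits the enabled key are still created, since lookup(v, "enabled", true) falls back to true.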
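
A note on the design choice in PATCH 4: a strict object({...}) type constraint makes every attribute required, which is why PATCH 2 had to add enabled: true to every catalog example; switching to type = any lets entries omit the key so that lookup(v, "enabled", true) can supply the default, at the cost of type checking (the commented-out schema is kept as documentation). Terraform v1.3+ offers optional() attributes in object type constraints, which would allow restoring the strict schema, but that feature postdates this February 2022 patch series. A minimal sketch of the filter semantics, with hypothetical monitor keys:

    locals {
      catalog = {
        "monitor-a" = { name = "created: enabled key absent, defaults to true" }
        "monitor-b" = { name = "skipped", enabled = false }
      }

      # Same expression as in modules/monitors/main.tf after PATCH 3.
      active = { for k, v in local.catalog : k => v if lookup(v, "enabled", true) }
      # => { "monitor-a" = { name = "created: enabled key absent, defaults to true" } }
    }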