Feature/enabled monitors #58

Merged · 5 commits · Feb 11, 2022
Changes from 3 commits
5 changes: 5 additions & 0 deletions catalog/monitors/amq.yaml
@@ -3,6 +3,7 @@

amq-cpu-utilization:
name: "(AMQ) CPU Utilization above 90%"
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.amazonmq.cpu_utilization{*} by {broker} > 90
@@ -45,6 +46,7 @@ amq-cpu-utilization:

amq-heap-usage:
name: "(AMQ) JVM heap usage above 95%"
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.amazonmq.heap_usage{*} by {broker} > 95
@@ -87,6 +89,7 @@ amq-heap-usage:

amq-network-in:
name: "(AMQ) Anomaly of a large variance in network-in bytes"
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.amazonmq.network_in{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
@@ -128,6 +131,7 @@ amq-network-in:

amq-network-out:
name: "(AMQ) Anomaly of a large variance in network-out bytes"
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.amazonmq.network_out{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
@@ -169,6 +173,7 @@ amq-network-out:

amq-current-connections-count:
name: "(AMQ) Anomaly of a large variance in broker connections"
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.amazonmq.current_connections_count{*}.as_count(), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
1 change: 1 addition & 0 deletions catalog/monitors/aurora.yaml
@@ -3,6 +3,7 @@

aurora-replica-lag:
name: "(RDS) Aurora Replica Lag Detected"
enabled: true
type: metric alert
query: |
min(last_15m):min:aws.rds.aurora_replica_lag{*} by {dbinstanceidentifier} > 1000
1 change: 1 addition & 0 deletions catalog/monitors/ec2.yaml
@@ -3,6 +3,7 @@

ec2-failed-status-check:
name: "(EC2) Failed Status Check"
enabled: true
type: metric alert
query: |
avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0
4 changes: 4 additions & 0 deletions catalog/monitors/host.yaml
@@ -3,6 +3,7 @@

host-io-wait-times:
name: "(Host) I/O Wait Times"
enabled: true
type: metric alert
query: "avg(last_10m):avg:system.cpu.iowait{*} by {host} > 50"
message: |-
@@ -39,6 +40,7 @@ host-io-wait-times:

host-disk-use:
name: "(Host) Host Disk Usage"
enabled: true
type: metric alert
query: "avg(last_30m):(avg:system.disk.total{*} by {host} - avg:system.disk.free{*} by {host}) / avg:system.disk.total{*} by {host} * 100 > 90"
message: |-
@@ -75,6 +77,7 @@ host-disk-use:

host-high-mem-use:
name: "(Host) Memory Utilization"
enabled: true
type: query alert
query: "avg(last_15m):avg:system.mem.pct_usable{*} by {host} < 0.1"
message: |-
@@ -111,6 +114,7 @@ host-high-mem-use:

host-high-load-avg:
name: "(Host) High System Load Average"
enabled: true
type: metric alert
query: "avg(last_30m):avg:system.load.norm.5{*} by {host} > 2"
message: |-
21 changes: 21 additions & 0 deletions catalog/monitors/k8s.yaml
@@ -3,6 +3,7 @@

k8s-deployment-replica-pod-down:
name: "(k8s) Deployment Replica Pod is down"
enabled: true
type: query alert
query: |
avg(last_15m):avg:kubernetes_state.deployment.replicas_desired{*} by {cluster_name,deployment} - avg:kubernetes_state.deployment.replicas_ready{*} by {cluster_name,deployment} >= 2
@@ -35,6 +36,7 @@ k8s-deployment-replica-pod-down:

k8s-pod-restarting:
name: "(k8s) Pods are restarting multiple times"
enabled: true
type: query alert
query: |
change(sum(last_5m),last_5m):exclude_null(avg:kubernetes.containers.restarts{*} by {cluster_name,kube_namespace,pod_name}) > 5
@@ -68,6 +70,7 @@ k8s-pod-restarting:

k8s-statefulset-replica-down:
name: "(k8s) StatefulSet Replica Pod is down"
enabled: true
type: query alert
query: |
max(last_15m):sum:kubernetes_state.statefulset.replicas_desired{*} by {cluster_name,kube_namespace,statefulset} - sum:kubernetes_state.statefulset.replicas_ready{*} by {cluster_name,kube_namespace,statefulset} >= 2
@@ -101,6 +104,7 @@ k8s-statefulset-replica-down:

k8s-daemonset-pod-down:
name: "(k8s) DaemonSet Pod is down"
enabled: true
type: query alert
query: |
max(last_15m):sum:kubernetes_state.daemonset.desired{*} by {cluster_name,kube_namespace,daemonset} - sum:kubernetes_state.daemonset.ready{*} by {cluster_name,kube_namespace,daemonset} >= 1
@@ -133,6 +137,7 @@ k8s-daemonset-pod-down:

k8s-crashloopBackOff:
name: "(k8s) CrashloopBackOff detected"
enabled: true
type: query alert
query: |
max(last_10m):max:kubernetes_state.container.status_report.count.waiting{reason:crashloopbackoff} by {cluster_name,kube_namespace,pod_name} >= 1
@@ -165,6 +170,7 @@ k8s-crashloopBackOff:

k8s-multiple-pods-failing:
name: "(k8s) Multiple Pods are failing"
enabled: true
type: query alert
query: |
change(avg(last_5m),last_5m):sum:kubernetes_state.pod.status_phase{phase:failed} by {cluster_name,kube_namespace} > 10
@@ -198,6 +204,7 @@ k8s-multiple-pods-failing:

k8s-unavailable-deployment-replica:
name: "(k8s) Unavailable Deployment Replica(s) detected"
enabled: true
type: metric alert
query: |
max(last_10m):max:kubernetes_state.deployment.replicas_unavailable{*} by {cluster_name,kube_namespace} > 0
@@ -235,6 +242,7 @@ k8s-unavailable-deployment-replica:

k8s-unavailable-statefulset-replica:
name: "(k8s) Unavailable Statefulset Replica(s) detected"
enabled: true
type: metric alert
query: |
max(last_10m):max:kubernetes_state.statefulset.replicas_unavailable{*} by {cluster_name,kube_namespace} > 0
@@ -272,6 +280,7 @@ k8s-unavailable-statefulset-replica:

k8s-node-status-unschedulable:
name: "(k8s) Detected Unschedulable Node(s)"
enabled: true
type: query alert
query: |
max(last_15m):sum:kubernetes_state.node.status{status:schedulable} by {cluster_name} * 100 / sum:kubernetes_state.node.status{*} by {cluster_name} < 80
@@ -309,6 +318,7 @@ k8s-node-status-unschedulable:

k8s-imagepullbackoff:
name: "(k8s) ImagePullBackOff detected"
enabled: true
type: "query alert"
query: |
max(last_10m):max:kubernetes_state.container.status_report.count.waiting{reason:imagepullbackoff} by {kube_cluster_name,kube_namespace,pod_name} >= 1
@@ -346,6 +356,7 @@ k8s-imagepullbackoff:

k8s-high-cpu-usage:
name: "(k8s) High CPU Usage Detected"
enabled: true
type: metric alert
query: |
avg(last_10m):avg:system.cpu.system{*} by {host} > 90
@@ -383,6 +394,7 @@ k8s-high-cpu-usage:

k8s-high-disk-usage:
name: "(k8s) High Disk Usage Detected"
enabled: true
type: metric alert
query: |
min(last_5m):min:system.disk.used{*} by {host,cluster_name} / avg:system.disk.total{*} by {host,cluster_name} * 100 > 90
@@ -420,6 +432,7 @@ k8s-high-disk-usage:

k8s-high-memory-usage:
name: "(k8s) High Memory Usage Detected"
enabled: true
type: metric alert
query: |
avg(last_10m):avg:kubernetes.memory.usage_pct{*} by {cluster_name} > 90
@@ -462,6 +475,7 @@ k8s-high-memory-usage:

k8s-high-filesystem-usage:
name: "(k8s) High Filesystem Usage Detected"
enabled: true
type: metric alert
query: |
avg(last_10m):avg:kubernetes.filesystem.usage_pct{*} by {cluster_name} > 90
@@ -504,6 +518,7 @@ k8s-high-filesystem-usage:

k8s-network-tx-errors:
name: "(k8s) High Network TX (send) Errors"
enabled: true
type: metric alert
query: |
avg(last_10m):avg:kubernetes.network.tx_errors{*} by {cluster_name} > 100
@@ -546,6 +561,7 @@ k8s-network-tx-errors:

k8s-network-rx-errors:
name: "(k8s) High Network RX (receive) Errors"
enabled: true
type: metric alert
query: |
avg(last_10m):avg:kubernetes.network.rx_errors{*} by {cluster_name} > 100
@@ -588,6 +604,7 @@ k8s-network-rx-errors:

k8s-node-not-ready:
name: "(k8s) Node Not Ready"
enabled: true
type: service check
query: |
"kubernetes_state.node.ready".by('host').last(5).count_by_status()
@@ -625,6 +642,7 @@ k8s-node-not-ready:

k8s-kube-api-down:
name: "(k8s) KubeAPI Down"
enabled: true
type: service check
query: |
"kube_apiserver_controlplane.up".by('host').last(5).count_by_status()
@@ -662,6 +680,7 @@ k8s-kube-api-down:

k8s-increased-pod-crash:
name: "(k8s) Increased Pod Crashes"
enabled: true
type: query alert
query: |
avg(last_5m):avg:kubernetes_state.container.restarts{*} by {cluster_name,kube_namespace,pod} - hour_before(avg:kubernetes_state.container.restarts{*} by {cluster_name,kube_namespace,pod}) > 3
@@ -699,6 +718,7 @@ k8s-increased-pod-crash:

k8s-hpa-errors:
name: "(k8s) HPA Errors"
enabled: true
type: event alert
query: |
events('sources:kubernetes priority:all \"unable to fetch metrics from resource metrics API:\"').by('hpa').rollup('count').last('1h') > 200
@@ -736,6 +756,7 @@ k8s-hpa-errors:

k8s-pending-pods:
name: "(k8s) Pending Pods"
enabled: true
type: metric alert
query: |
min(last_30m):sum:kubernetes_state.pod.status_phase{phase:running} by {cluster_name} - sum:kubernetes_state.pod.status_phase{phase:running} by {cluster_name} + sum:kubernetes_state.pod.status_phase{phase:pending} by {cluster_name}.fill(zero) >= 1
2 changes: 2 additions & 0 deletions catalog/monitors/rabbitmq.yaml
@@ -1,5 +1,6 @@
rabbitmq-messages-unacknowledged-rate-too-high:
name: "[RabbitMQ] - Messages unacknowledged rate is higher than usual on: {{host.name}}"
enabled: true
type: "query alert"
query: |
avg(last_4h):anomalies(avg:rabbitmq.queue.messages_unacknowledged.rate{*} by {rabbitmq_queue,host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
@@ -41,6 +42,7 @@ rabbitmq-messages-unacknowledged-rate-too-high:

rabbitmq-disk-usage-too-high:
name: "[RabbitMQ] Level of disk usage is too high for host: {{host.name}}"
enabled: true
type: "query alert"
query: |
avg(last_5m):avg:rabbitmq.node.mem_used{*} by {host} / avg:system.mem.total{*} by {host} * 100 > 35
5 changes: 5 additions & 0 deletions catalog/monitors/rds.yaml
@@ -3,6 +3,7 @@

rds-cpuutilization:
name: "(RDS) CPU Utilization above 90%"
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier} > 90
@@ -45,6 +46,7 @@ rds-cpuutilization:

rds-disk-queue-depth:
name: "(RDS) Disk queue depth above 64"
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.rds.disk_queue_depth{*} by {dbinstanceidentifier} > 64
@@ -87,6 +89,7 @@ rds-disk-queue-depth:

rds-freeable-memory:
name: "(RDS) Freeable memory below 256 MB"
enabled: true
type: metric alert
query: |
avg(last_5m):avg:aws.rds.freeable_memory{*} < 256000000
@@ -129,6 +132,7 @@ rds-freeable-memory:

rds-swap-usage:
name: "(RDS) Swap usage above 256 MB"
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.rds.swap_usage{*} by {dbinstanceidentifier} > 256000000
@@ -171,6 +175,7 @@ rds-swap-usage:

rds-database-connections:
name: "(RDS) Anomaly of a large variance in RDS connection count"
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.rds.database_connections{*}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
9 changes: 9 additions & 0 deletions catalog/monitors/redshift.yaml
@@ -3,6 +3,7 @@

redshift-health-status:
name: (Redshift) Health Status
enabled: true
type: metric alert
query: |
min(last_1h):min:aws.redshift.health_status{*} by {clusteridentifier} <= 0
@@ -42,6 +43,7 @@ redshift-health-status:

redshift-database-connections:
name: (Redshift) Anomaly of a large variance in Redshift connection count
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.database_connections{*} by {clusteridentifier}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
@@ -83,6 +85,7 @@ redshift-database-connections:

redshift-cpuutilization:
name: (Redshift) CPU Utilization above 90%
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.redshift.cpuutilization{*} by {clusteridentifier} > 90
@@ -125,6 +128,7 @@ redshift-cpuutilization:

redshift-write-latency:
name: (Redshift) Write latency/cluster
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.redshift.write_latency{*} by {clusteridentifier} > 15
@@ -167,6 +171,7 @@ redshift-write-latency:

redshift-disk-space-used:
name: (Redshift) Percent disk space used/cluster
enabled: true
type: metric alert
query: |
avg(last_15m):avg:aws.redshift.percentage_disk_space_used{*} by {clusteridentifier} > 85
@@ -209,6 +214,7 @@ redshift-disk-space-used:

redshift-network-receive:
name: (Redshift) Network throughput - received
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.network_receive_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
@@ -250,6 +256,7 @@ redshift-network-receive:

redshift-network-transmit:
name: (Redshift) Network throughput - sent
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.network_transmit_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
@@ -291,6 +298,7 @@ redshift-network-transmit:

redshift-read-throughput:
name: (Redshift) Read throughput/cluster
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.read_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
@@ -332,6 +340,7 @@ redshift-read-throughput:

redshift-write-throughput:
name: (Redshift) Write throughput/cluster
enabled: true
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.write_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
2 changes: 1 addition & 1 deletion modules/monitors/main.tf
@@ -6,7 +6,7 @@ locals {

# https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor
resource "datadog_monitor" "default" {
for_each = local.enabled ? var.datadog_monitors : {}
for_each = local.enabled ? { for k, v in var.datadog_monitors : k => v if lookup(v, "enabled", true) } : {}
@aknysh (Member) commented on Feb 11, 2022:
Did you try it with module.this.enabled = false?
In some cases, TF complains about both sides of the ? operator not having the same element types.
If that's an issue, we can do

for_each = { for k, v in var.datadog_monitors : k => v if local.enabled && lookup(v, "enabled", true) }

Member Author replied:
TF complained about it in the component that merges the YAML, but not in the module. I figured out an adjustment to some of the component's for_each expressions to get it to work.


name = each.value.name
type = each.value.type
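For reference, a minimal sketch of how the merged for_each reads in context. Only the name and type arguments appear in the visible diff; the query and message arguments below are assumed from the variable definition and the shape of the monitor catalog:

    # Sketch of the filtered resource; arguments beyond name/type are assumed,
    # not shown in this diff. lookup() falls back to true, so a map entry
    # without an explicit enabled key would still be created.
    resource "datadog_monitor" "default" {
      for_each = local.enabled ? { for k, v in var.datadog_monitors : k => v if lookup(v, "enabled", true) } : {}

      name    = each.value.name
      type    = each.value.type
      query   = each.value.query
      message = each.value.message
    }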
1 change: 1 addition & 0 deletions modules/monitors/variables.tf
@@ -1,6 +1,7 @@
# https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor
variable "datadog_monitors" {
type = map(object({
enabled = bool
name = string
type = string
message = string
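To see the flag end to end, a hypothetical caller could pass a map like the one below; the entry with enabled = false is filtered out by the for_each in modules/monitors/main.tf before any resource is created. All keys and values here are illustrative, and the remaining required object attributes are elided:

    # Hypothetical usage sketch: only "host-disk-use" produces a monitor.
    module "monitors" {
      source = "./modules/monitors"

      datadog_monitors = {
        host-disk-use = {
          enabled = true                  # created
          name    = "(Host) Host Disk Usage"
          type    = "metric alert"
          message = "Disk usage above 90% on {{host.name}}"
          # ...remaining attributes required by the object type...
        }
        host-io-wait-times = {
          enabled = false                 # dropped by the for_each filter
          name    = "(Host) I/O Wait Times"
          type    = "metric alert"
          message = "High I/O wait on {{host.name}}"
          # ...remaining attributes required by the object type...
        }
      }
    }

Note that because enabled is declared as a plain bool attribute of the object type, callers must set it on every entry; the lookup(v, "enabled", true) default only comes into play if the variable type is later relaxed (for example to any or, on newer Terraform, optional(bool, true)).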