Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor slo module. Update monitor-based SLOs to be able to create corresponding monitors #69

Merged
merged 12 commits into from
Aug 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
synthetics-slo:
metric-slo:
name: "(SLO) Synthetic Checks"
type: metric
query:
Expand All @@ -17,9 +17,7 @@ synthetics-slo:
- target: "99"
timeframe: "30d"
warning: "99.5"
groups: []
monitor_ids: []
tags:
managedby: terraform
ManagedBy: terraform
test: true
api_version: null
56 changes: 56 additions & 0 deletions examples/slo/catalog/monitor_slo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
monitor-slo:
name: "(SLO) EC2 Availability"
type: monitor
description: |
Number of EC2 failed status checks.
message: |
({stage} {region}) {instance_id} failed a SLO check
force_delete: true
validate: true
thresholds:
- target: "99.5"
timeframe: "7d"
warning: "99.9"
- target: "99"
timeframe: "30d"
warning: "99.5"
# Either `monitor_ids` or `monitors` should be provided
# `monitor_ids` is a list of externally created monitors to use for this monitor-based SLO
# If `monitors` map is provided, the monitors will be created by the module and assigned to the SLO
monitor_ids: null
monitors:
ec2-failed-status-check:
name: "(EC2) Status Check"
type: metric alert
query: |
avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0
message: |
({stage} {region}) {instance_id} failed a status check
escalation_message: ""
tags:
ManagedBy: Terraform
priority: 3
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 0
evaluation_delay: 60
new_host_delay: 300
new_group_delay: 0
groupby_simple_monitor: false
renotify_occurrences: 0
renotify_statuses: []
validate: true
no_data_timeframe: 10
threshold_windows: {}
thresholds:
critical: 0
tags:
ManagedBy: terraform
test: true
api_version: null
23 changes: 14 additions & 9 deletions examples/slo/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
output "datadog_metric_slos" {
value = module.datadog_slo.datadog_metric_slos
description = "Map of created Metric Based SLOs"
}

output "datadog_monitor_slos" {
value = module.datadog_slo.datadog_monitor_slos
description = "Map of created Monitor Based SLOs"
description = "Map of created monitor-based SLOs"
}

output "datadog_monitor_slo_monitors" {
value = module.datadog_slo.datadog_monitor_slo_monitors
description = "Created monitors for the monitor-based SLOs"
}

output "datadog_metric_slos" {
value = module.datadog_slo.datadog_metric_slos
description = "Map of created metric-based SLOs"
}

output "datadog_slo_alerts" {
value = module.datadog_slo.datadog_slo_alerts
description = "Map of created SLO Based Alerts"
output "datadog_metric_slo_alerts" {
value = module.datadog_slo.datadog_metric_slo_alerts
description = "Map of created metric-based SLO alerts"
}
97 changes: 80 additions & 17 deletions modules/slo/README.md
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
# Datadog SLO

This module is responsible for creating Datadog [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) and their related alerts.
This module is responsible for creating Datadog [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) and their related monitors and alerts.

The module can create metric-based SLOs (and the corresponding alerts) and monitor-based SLOs (and the corresponding monitors).

## Alerts
Datadog Alerts for SLOs are terraformed through the monitor object.

Datadog alerts for SLOs are managed in Terraform through the `datadog_monitor` resource (monitor type `slo alert`).

An SLO can have many thresholds, but a monitor can have only one. To work around this, the module creates one Datadog monitor for each threshold within an SLO.

For example
## Usage

Example of metric-based SLO:

```yaml
synthetics-slo:
metric-slo:
name: "(SLO) Synthetic Checks"
type: metric
query:
Expand All @@ -23,23 +28,81 @@ synthetics-slo:
force_delete: true
validate: true
thresholds:
- target: "99.5"
target_display: "99.50"
timeframe: "7d"
warning: "99.9"
warning_display: "99.90"
- target: "99"
target_display: "99.00"
timeframe: "30d"
warning: "99.5"
warning_display: "99.50"
groups: []
monitor_ids: []
- target: "99.5"
timeframe: "7d"
warning: "99.9"
- target: "99"
timeframe: "30d"
warning: "99.5"
tags:
managedby: terraform
ManagedBy: terraform
test: true
api_version: null
```

Example of monitor-based SLO:

```yaml
monitor-slo:
name: "(SLO) EC2 Availability"
type: monitor
description: |
Number of EC2 failed status checks.
message: |
({stage} {region}) {instance_id} failed a SLO check
force_delete: true
validate: true
thresholds:
- target: "99.5"
timeframe: "7d"
warning: "99.9"
- target: "99"
timeframe: "30d"
warning: "99.5"
# Either `monitor_ids` or `monitors` should be provided
# `monitor_ids` is a list of externally created monitors to use for this monitor-based SLO
# If `monitors` map is provided, the monitors will be created by the module and assigned to the SLO
monitor_ids: null
monitors:
ec2-failed-status-check:
name: "(EC2) Status Check"
type: metric alert
query: |
avg(last_10m):avg:aws.ec2.status_check_failed{*} by {instance_id} > 0
message: |
({stage} {region}) {instance_id} failed a status check
escalation_message: ""
tags:
ManagedBy: Terraform
priority: 3
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 0
evaluation_delay: 60
new_host_delay: 300
new_group_delay: 0
groupby_simple_monitor: false
renotify_occurrences: 0
renotify_statuses: []
validate: true
no_data_timeframe: 10
threshold_windows: {}
thresholds:
critical: 0
tags:
ManagedBy: terraform
test: true
api_version: null
```

## References
- [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/)
- [Monitor-based SLOs](https://docs.datadoghq.com/monitors/service_level_objectives/monitor/)
- [Datadog Error Budget](https://docs.datadoghq.com/monitors/service_level_objectives/error_budget/)
- [Monitor-based SLO example](https://github.com/DataDog/terraform-provider-datadog/issues/667)
119 changes: 0 additions & 119 deletions modules/slo/main.tf
Original file line number Diff line number Diff line change
@@ -1,124 +1,5 @@
locals {
  # Taken from `module.this.enabled`. When false, every map/list below is empty
  # and `alert_tags` is an empty string.
  enabled = module.this.enabled

  # Monitor-based SLO definitions keyed by SLO name.
  # An SLO is kept unless it sets `enabled: false` (the flag defaults to true).
  datadog_monitor_slos = {
    for slo in var.datadog_slos :
    slo.name => slo
    if slo.type == "monitor" && lookup(slo, "enabled", true) && local.enabled
  }

  # Metric-based SLO definitions keyed by SLO name, filtered the same way.
  datadog_metric_slos = {
    for slo in var.datadog_slos :
    slo.name => slo
    if slo.type == "metric" && lookup(slo, "enabled", true) && local.enabled
  }

  # One entry per (metric SLO, threshold) pair, flattened into a single list.
  # `slo_name` combines the key of the SLO in `var.datadog_slos` with the
  # threshold index, e.g. "<key>_threshold0".
  temp_datadog_slo_metric_monitors = flatten([
    for name, slo in var.datadog_slos : [
      for i, threshold in slo.thresholds : {
        slo_name  = "${name}_threshold${i}"
        slo       = slo
        threshold = threshold
      }
      if slo.type == "metric" && local.enabled && lookup(slo, "enabled", true)
    ]
  ])

  # The same entries as a map keyed by `slo_name`, suitable for `for_each`.
  datadog_slo_metric_monitors = {
    for monitor in local.temp_datadog_slo_metric_monitors :
    monitor.slo_name => monitor
  }

  # Suffix appended to alert messages: the separator followed by all alert tags
  # joined with that separator. Empty when the module is disabled or no alert
  # tags were supplied.
  alert_tags = (
    local.enabled && var.alert_tags != null
    ? format("%s%s", var.alert_tags_separator, join(var.alert_tags_separator, var.alert_tags))
    : ""
  )
}

# Creates one monitor-based SLO for every entry in `local.datadog_monitor_slos`.
resource "datadog_service_level_objective" "monitor_slo" {
  for_each = local.datadog_monitor_slos

  # Required
  name = each.value.name
  type = each.value.type

  # One `thresholds` block per threshold defined on the SLO.
  dynamic "thresholds" {
    for_each = each.value.thresholds
    content {
      # `thresholds` is the dynamic-block iterator and only exposes `key` and
      # `value`; the threshold definition itself is `thresholds.value`.
      # Looking attributes up on the iterator would always fall through to the
      # defaults and ignore the configured values.
      target    = lookup(thresholds.value, "target", "99.00")
      timeframe = lookup(thresholds.value, "timeframe", "7d")

      target_display  = lookup(thresholds.value, "target_display", "98.00")
      warning         = lookup(thresholds.value, "warning", "99.95")
      warning_display = lookup(thresholds.value, "warning_display", "98.00")
    }
  }

  groups      = lookup(each.value, "groups", [])
  monitor_ids = each.value.monitor_ids

  # Optional
  description  = lookup(each.value, "description", null)
  force_delete = lookup(each.value, "force_delete", true)
  validate     = lookup(each.value, "validate", false)

  # Convert the Terraform tags map to a list of Datadog tags.
  # Falls back to `module.this.tags` when the SLO defines no `tags`.
  # If a key is supplied with a value, it will render "key:value" as a tag
  #   tags:
  #     key: value
  # If a key is supplied without a value (null), it will render "key" as a tag
  #   tags:
  #     key: null
  tags = [
    for tagk, tagv in lookup(each.value, "tags", module.this.tags) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
  ]
}

# Creates one metric-based SLO for every entry in `local.datadog_metric_slos`.
resource "datadog_service_level_objective" "metric_slo" {
  for_each = local.datadog_metric_slos

  # Required attributes, taken directly from the SLO definition.
  type = each.value.type
  name = each.value.name

  # Numerator and denominator queries of the SLO definition.
  query {
    numerator   = each.value.query.numerator
    denominator = each.value.query.denominator
  }

  # One `thresholds` block per threshold; attributes missing from a threshold
  # are passed as null.
  dynamic "thresholds" {
    for_each = each.value.thresholds
    iterator = slo_threshold
    content {
      timeframe = lookup(slo_threshold.value, "timeframe", null)
      target    = lookup(slo_threshold.value, "target", null)
      warning   = lookup(slo_threshold.value, "warning", null)
    }
  }

  # Optional attributes with their fallbacks.
  description  = lookup(each.value, "description", null)
  validate     = lookup(each.value, "validate", false)
  force_delete = lookup(each.value, "force_delete", true)

  # Render the tags map (the SLO's own `tags`, else `module.this.tags`) as a list:
  #   key: value  ->  "key:value"
  #   key: null   ->  "key"
  tags = [
    for tag_key, tag_value in lookup(each.value, "tags", module.this.tags) :
    tag_value == null ? tag_key : format("%s:%s", tag_key, tag_value)
  ]
}

# Creates one "slo alert" monitor for every entry in
# `local.datadog_slo_metric_monitors`, i.e. one per threshold of each metric SLO.
resource "datadog_monitor" "metric_slo_alert" {
  for_each = local.datadog_slo_metric_monitors

  type = "slo alert"
  name = "(SLO Error Budget Alert) ${each.value.slo.name}"

  # The SLO's own message with the module-wide alert tag suffix appended.
  message = format("%s%s", each.value.slo.message, local.alert_tags)

  # Error-budget query against the metric SLO created for this entry, over the
  # threshold's timeframe. The comparison value is the threshold's `target`
  # ("99.00" when not set). Kept as a plain heredoc so the rendered query string
  # is unchanged.
  query = <<EOF
error_budget("${datadog_service_level_objective.metric_slo[each.value.slo.name].id}").over("${each.value.threshold.timeframe}") > ${lookup(each.value.threshold, "target", "99.00")}
EOF

  # The critical threshold is the threshold's `target` (null when not set).
  monitor_thresholds {
    critical = lookup(each.value.threshold, "target", null)
  }

  # Render the tags map (the SLO's own `tags`, else `module.this.tags`) as a list:
  #   key: value  ->  "key:value"
  #   key: null   ->  "key"
  tags = [
    for tag_key, tag_value in lookup(each.value.slo, "tags", module.this.tags) :
    tag_value == null ? tag_key : format("%s:%s", tag_key, tag_value)
  ]
}
Loading