Skip to content

Commit

Permalink
[terraform] break up alarms into separate terraform files
Browse files Browse the repository at this point in the history
Summary:
Noticed that our aws_cloudwatch_alarms.tf file was getting a bit large.

Separated alarms into different files based on service.
The lambda alarm also has its own file. Alarms shared between services, such as the failed connection alarms and the ECS task stop alarms, were kept in aws_cloudwatch_alarms.tf.

This simply moves code around and will not result in any changes to the deployment.

Depends on D13942

Test Plan: terraform plan had no changes

Reviewers: bartek, varun

Reviewed By: bartek

Subscribers: ashoat, tomek

Differential Revision: https://phab.comm.dev/D13943
  • Loading branch information
wyilio committed Nov 15, 2024
1 parent 3d2db76 commit efe9bf6
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 250 deletions.
82 changes: 82 additions & 0 deletions services/terraform/remote/alarms_blob.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Error categories tracked for the blob service. The map is iterated with
# for_each by the blob metric filter and alarm resources: `name` is spliced
# into metric/alarm names, `pattern` is the errorType value matched in logs.
locals {
  blob_error_patterns = {
    S3 = {
      name    = "S3"
      pattern = "S3 Error"
    }
    DDB = {
      name    = "DDB"
      pattern = "DDB Error"
    }
    HTTP = {
      name    = "HTTP"
      pattern = "HTTP Error"
    }
    Other = {
      name    = "Other"
      pattern = "Other Error"
    }
  }
}

# SNS topic that the blob service alarms publish to (see their alarm_actions).
resource "aws_sns_topic" "blob_error_topic" {
  name = "blob-error-topic"
}

# Email subscription for the blob error topic. The recipient comes from
# local.error_reports_subscribed_email, which is not defined in this section.
resource "aws_sns_topic_subscription" "blob_email_subscription" {
  protocol  = "email"
  endpoint  = local.error_reports_subscribed_email
  topic_arn = aws_sns_topic.blob_error_topic.arn
}

# One log metric filter per blob error category. Each matching log event
# contributes a value of 1 to the Blob<name>ErrorCount metric.
resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" {
  for_each = local.blob_error_patterns

  log_group_name = "/ecs/blob-service-task-def"
  name           = "Blob${each.value.name}ErrorCount"

  # JSON filter: level must be ERROR and fields.errorType must match this
  # category's pattern.
  pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"

  metric_transformation {
    namespace = "BlobServiceMetricFilters"
    name      = "Blob${each.value.name}ErrorCount"
    value     = "1"
  }
}

# One alarm per blob error category, watching the Blob<name>ErrorCount metric
# in the BlobServiceMetricFilters namespace. It triggers when the sum over a
# single 300-second period is >= 1 and notifies the blob error topic.
resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" {
  for_each = local.blob_error_patterns

  # The environment name is embedded so staging and production alarms differ.
  alarm_name        = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
  alarm_description = "Alarm when Blob ${each.value.name} errors exceed threshold"

  # Metric being watched.
  namespace   = "BlobServiceMetricFilters"
  metric_name = "Blob${each.value.name}ErrorCount"
  statistic   = "Sum"
  period      = "300"

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = "1"

  # Notification.
  actions_enabled = true
  alarm_actions   = [aws_sns_topic.blob_error_topic.arn]
}

# Alarm on the blob ECS service's average MemoryUtilization: triggers when a
# single 60-second period averages >= 90 and notifies the blob error topic.
resource "aws_cloudwatch_metric_alarm" "blob_memory_utilization" {
  alarm_name        = "BlobMemoryUtilizationAlarm"
  alarm_description = "Alarm when Blob service memory utilization exceeds 90%"

  # Metric being watched, scoped to the blob service in the comm_services cluster.
  namespace   = "AWS/ECS"
  metric_name = "MemoryUtilization"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClusterName = aws_ecs_cluster.comm_services.name
    ServiceName = aws_ecs_service.blob_service.name
  }

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 90
  evaluation_periods  = 1

  # Notification.
  alarm_actions = [aws_sns_topic.blob_error_topic.arn]
}

# Alarm on the blob ECS service's average CPUUtilization: triggers when a
# single 60-second period averages >= 90 and notifies the blob error topic.
resource "aws_cloudwatch_metric_alarm" "blob_cpu_utilization" {
  alarm_name        = "BlobCPUUtilizationAlarm"
  alarm_description = "Alarm when Blob service CPU utilization exceeds 90%"

  # Metric being watched, scoped to the blob service in the comm_services cluster.
  namespace   = "AWS/ECS"
  metric_name = "CPUUtilization"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClusterName = aws_ecs_cluster.comm_services.name
    ServiceName = aws_ecs_service.blob_service.name
  }

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 90
  evaluation_periods  = 1

  # Notification.
  alarm_actions = [aws_sns_topic.blob_error_topic.arn]
}
54 changes: 54 additions & 0 deletions services/terraform/remote/alarms_identity.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Error categories tracked for the identity service, plus the alarm threshold.
# The map is iterated with for_each by the identity metric filter and alarm
# resources: `name` is spliced into metric/alarm names, `pattern` is the
# errorType value matched in logs. Map keys are the for_each instance keys and
# intentionally differ from `name` in some entries (Database/DB, Http/HTTP).
locals {
  identity_error_patterns = {
    Search = {
      name    = "Search"
      pattern = "Search Error"
    }
    Sync = {
      name    = "Sync"
      pattern = "Sync Error"
    }
    Database = {
      name = "DB"
      # The leading "*" is kept verbatim; presumably a filter-pattern wildcard
      # so any errorType ending in "DB Error" matches (confirm against logs).
      pattern = "*DB Error"
    }
    GrpcServices = {
      name    = "GrpcServices"
      pattern = "gRPC Services Error"
    }
    Siwe = {
      name    = "Siwe"
      pattern = "SIWE Error"
    }
    Tunnelbroker = {
      name    = "Tunnelbroker"
      pattern = "Tunnelbroker Error"
    }
    Http = {
      name    = "HTTP"
      pattern = "HTTP Error"
    }
  }

  # Threshold used by the identity error alarms.
  identity_error_threshold = 1
}

# SNS topic that the identity service error alarms publish to.
resource "aws_sns_topic" "identity_error_topic" {
  name = "identity-error-topic"
}

# Email subscription for the identity error topic. The recipient comes from
# local.error_reports_subscribed_email, which is not defined in this section.
resource "aws_sns_topic_subscription" "identity_email_subscription" {
  protocol  = "email"
  endpoint  = local.error_reports_subscribed_email
  topic_arn = aws_sns_topic.identity_error_topic.arn
}

# One log metric filter per identity error category. Each matching log event
# contributes a value of 1 to the Identity<name>ErrorCount metric.
resource "aws_cloudwatch_log_metric_filter" "identity_error_filters" {
  for_each = local.identity_error_patterns

  log_group_name = "/ecs/identity-service-task-def"
  name           = "Identity${each.value.name}ErrorCount"

  # JSON filter: level must be ERROR and fields.errorType must match this
  # category's pattern.
  pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"

  metric_transformation {
    namespace = "IdentityServiceMetricFilters"
    name      = "Identity${each.value.name}ErrorCount"
    value     = "1"
  }
}

# One alarm per identity error category, watching the Identity<name>ErrorCount
# metric in the IdentityServiceMetricFilters namespace. It triggers when the
# sum over a single 300-second period is >= local.identity_error_threshold and
# notifies the identity error topic.
resource "aws_cloudwatch_metric_alarm" "identity_error_alarms" {
  for_each = local.identity_error_patterns

  # The environment name is embedded so staging and production alarms differ.
  alarm_name        = "Identity${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
  alarm_description = "Alarm when Identity ${each.value.name} errors exceed threshold"

  # Metric being watched.
  namespace   = "IdentityServiceMetricFilters"
  metric_name = "Identity${each.value.name}ErrorCount"
  statistic   = "Sum"
  period      = "300"

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.identity_error_threshold
  evaluation_periods  = "1"

  # Notification.
  actions_enabled = true
  alarm_actions   = [aws_sns_topic.identity_error_topic.arn]
}

30 changes: 30 additions & 0 deletions services/terraform/remote/alarms_lambda.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
locals {
  # Threshold for the search index lambda error alarm, compared against the
  # summed error count with GreaterThanOrEqualToThreshold. Declared as a
  # number (not the string "2") to match the numeric `threshold` argument and
  # the other threshold locals; Terraform resolves both to the same value.
  lambda_error_threshold = 2
}

# SNS topic that the search index lambda error alarm publishes to.
resource "aws_sns_topic" "lambda_alarm_topic" {
  name = "lambda-error-alarm-topic"
}

# Email subscription for the lambda alarm topic. The recipient comes from
# local.error_reports_subscribed_email, which is not defined in this section.
resource "aws_sns_topic_subscription" "email_subscription" {
  protocol  = "email"
  endpoint  = local.error_reports_subscribed_email
  topic_arn = aws_sns_topic.lambda_alarm_topic.arn
}

# Alarm on failures of the search index lambda function. It triggers when the
# summed error count over a single 300-second period is >=
# local.lambda_error_threshold and notifies the lambda alarm topic.
resource "aws_cloudwatch_metric_alarm" "lambda_error_alarm" {
  alarm_name        = "SearchIndexLambdaErrorAlarm"
  alarm_description = "Alarm tracking search index lambda function failure"

  # Metric being watched. The AWS/Lambda namespace publishes function errors
  # under the metric name "Errors"; the previous value "LambdaErrors" is not a
  # metric in that namespace, so the alarm never received data and could not fire.
  namespace   = "AWS/Lambda"
  metric_name = "Errors"
  statistic   = "Sum"
  period      = 300
  dimensions = {
    FunctionName = module.shared.search_index_lambda.function_name
  }

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = local.lambda_error_threshold
  evaluation_periods  = 1

  # Notification.
  actions_enabled = true
  alarm_actions   = [aws_sns_topic.lambda_alarm_topic.arn]
}
88 changes: 88 additions & 0 deletions services/terraform/remote/alarms_tunnelbroker.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Error categories tracked for the tunnelbroker service. The map is iterated
# with for_each by the tunnelbroker metric filter and alarm resources: `name`
# is spliced into metric/alarm names, `pattern` is the errorType value matched
# in logs.
locals {
  tunnelbroker_error_patterns = {
    AMQP = {
      name    = "AMQP"
      pattern = "AMQP Error"
    }
    DDB = {
      name    = "DDB"
      pattern = "DDB Error"
    }
    FCM = {
      name    = "FCM"
      pattern = "FCM Error"
    }
    APNs = {
      name    = "APNs"
      pattern = "APNs Error"
    }
    WebPush = {
      name = "WebPush"
      # Unlike the other entries, the pattern has a space: "Web Push".
      pattern = "Web Push Error"
    }
    WNS = {
      name    = "WNS"
      pattern = "WNS Error"
    }
    Identity = {
      name    = "Identity"
      pattern = "Identity Error"
    }
    Websocket = {
      name    = "Websocket"
      pattern = "Websocket Error"
    }
    Server = {
      name    = "Server"
      pattern = "Server Error"
    }
  }
}

# SNS topic that the tunnelbroker alarms publish to (see their alarm_actions).
resource "aws_sns_topic" "tunnelbroker_error_topic" {
  name = "tunnelbroker-error-topic"
}

# Email subscription for the tunnelbroker error topic. The recipient comes from
# local.error_reports_subscribed_email, which is not defined in this section.
resource "aws_sns_topic_subscription" "tunnelbroker_email_subscription" {
  protocol  = "email"
  endpoint  = local.error_reports_subscribed_email
  topic_arn = aws_sns_topic.tunnelbroker_error_topic.arn
}

# One log metric filter per tunnelbroker error category. Each matching log
# event contributes a value of 1 to the Tunnelbroker<name>ErrorCount metric.
resource "aws_cloudwatch_log_metric_filter" "tunnelbroker_error_filters" {
  for_each = local.tunnelbroker_error_patterns

  log_group_name = "/ecs/tunnelbroker-task-def"
  name           = "Tunnelbroker${each.value.name}ErrorCount"

  # JSON filter: level must be ERROR and fields.errorType must match this
  # category's pattern.
  pattern = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"

  metric_transformation {
    namespace = "TunnelbrokerServiceMetricFilters"
    name      = "Tunnelbroker${each.value.name}ErrorCount"
    value     = "1"
  }
}

# One alarm per tunnelbroker error category, watching the
# Tunnelbroker<name>ErrorCount metric in the TunnelbrokerServiceMetricFilters
# namespace. It triggers when the sum over a single 300-second period is >= 1
# and notifies the tunnelbroker error topic.
resource "aws_cloudwatch_metric_alarm" "tunnelbroker_error_alarms" {
  for_each = local.tunnelbroker_error_patterns

  # The environment name is embedded so staging and production alarms differ.
  alarm_name        = "Tunnelbroker${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
  alarm_description = "Alarm when Tunnelbroker ${each.value.name} errors exceed threshold"

  # Metric being watched.
  namespace   = "TunnelbrokerServiceMetricFilters"
  metric_name = "Tunnelbroker${each.value.name}ErrorCount"
  statistic   = "Sum"
  period      = "300"

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = "1"

  # Notification.
  actions_enabled = true
  alarm_actions   = [aws_sns_topic.tunnelbroker_error_topic.arn]
}

# Alarm on the tunnelbroker ECS service's average MemoryUtilization: triggers
# when a single 60-second period averages >= 90 and notifies the tunnelbroker
# error topic.
resource "aws_cloudwatch_metric_alarm" "tunnelbroker_memory_utilization" {
  alarm_name        = "TunnelbrokerMemoryUtilizationAlarm"
  alarm_description = "Alarm when Tunnelbroker service memory utilization exceeds 90%"

  # Metric being watched, scoped to the tunnelbroker service in the
  # comm_services cluster.
  namespace   = "AWS/ECS"
  metric_name = "MemoryUtilization"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClusterName = aws_ecs_cluster.comm_services.name
    ServiceName = aws_ecs_service.tunnelbroker.name
  }

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 90
  evaluation_periods  = 1

  # Notification.
  alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn]
}


# Alarm on the tunnelbroker ECS service's average CPUUtilization: triggers
# when a single 60-second period averages >= 90 and notifies the tunnelbroker
# error topic.
resource "aws_cloudwatch_metric_alarm" "tunnelbroker_cpu_utilization" {
  alarm_name        = "TunnelbrokerCPUUtilizationAlarm"
  alarm_description = "Alarm when Tunnelbroker service CPU utilization exceeds 90%"

  # Metric being watched, scoped to the tunnelbroker service in the
  # comm_services cluster.
  namespace   = "AWS/ECS"
  metric_name = "CPUUtilization"
  statistic   = "Average"
  period      = 60
  dimensions = {
    ClusterName = aws_ecs_cluster.comm_services.name
    ServiceName = aws_ecs_service.tunnelbroker.name
  }

  # Trigger condition.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 90
  evaluation_periods  = 1

  # Notification.
  alarm_actions = [aws_sns_topic.tunnelbroker_error_topic.arn]
}
Loading

0 comments on commit efe9bf6

Please sign in to comment.