Split the per-service CloudWatch alarm definitions out of
aws_cloudwatch_alarms.tf into alarms_blob.tf, alarms_identity.tf,
alarms_lambda.tf and alarms_tunnelbroker.tf.

Resource types and names are the same on both sides of the move.
Apart from the move itself:
  * identity_error_threshold becomes the number 1 (was the string "1"),
    and lambda_error_threshold becomes the number 2 (was the string "2").
  * The search index Lambda alarm watches the AWS/Lambda "Errors" metric
    instead of "LambdaErrors", which AWS/Lambda does not publish.
  * Each new file gains a short header comment.

NOTE(review): the index lines still carry the blob hashes of the original
patch and no longer describe the edited new-file contents. Intra-line
alignment was reconstructed; if a hunk is rejected on whitespace, retry
with `git apply --ignore-whitespace`.

diff --git a/services/terraform/remote/alarms_blob.tf b/services/terraform/remote/alarms_blob.tf
new file mode 100644
index 0000000000..0d5583dcb9
--- /dev/null
+++ b/services/terraform/remote/alarms_blob.tf
@@ -0,0 +1,84 @@
+# Blob service alarms: log-based error counts plus ECS memory/CPU utilization.
+
+locals {
+  blob_error_patterns = {
+    S3    = { name = "S3", pattern = "S3 Error" },
+    DDB   = { name = "DDB", pattern = "DDB Error" },
+    HTTP  = { name = "HTTP", pattern = "HTTP Error" },
+    Other = { name = "Other", pattern = "Other Error" },
+  }
+}
+
+resource "aws_sns_topic" "blob_error_topic" {
+  name = "blob-error-topic"
+}
+
+resource "aws_sns_topic_subscription" "blob_email_subscription" {
+  topic_arn = aws_sns_topic.blob_error_topic.arn
+  protocol  = "email"
+  endpoint  = local.error_reports_subscribed_email
+}
+
+resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" {
+  for_each = local.blob_error_patterns
+
+  name           = "Blob${each.value.name}ErrorCount"
+  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
+  log_group_name = "/ecs/blob-service-task-def"
+
+  metric_transformation {
+    name      = "Blob${each.value.name}ErrorCount"
+    namespace = "BlobServiceMetricFilters"
+    value     = "1"
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" {
+  for_each = local.blob_error_patterns
+
+  alarm_name          = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "Blob${each.value.name}ErrorCount"
+  namespace           = "BlobServiceMetricFilters"
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = 1
+  alarm_description   = "Alarm when Blob ${each.value.name} errors exceed threshold"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.blob_error_topic.arn]
+}
+
+resource "aws_cloudwatch_metric_alarm" "blob_memory_utilization" {
+  alarm_name          = "BlobMemoryUtilizationAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  metric_name         = "MemoryUtilization"
+  namespace           = "AWS/ECS"
+  period              = 60
+  statistic           = "Average"
+  threshold           = 90
+  alarm_description   = "Alarm when Blob service memory utilization exceeds 90%"
+  dimensions = {
+    ClusterName = aws_ecs_cluster.comm_services.name
+    ServiceName = aws_ecs_service.blob_service.name
+  }
+  alarm_actions = [aws_sns_topic.blob_error_topic.arn]
+}
+
+resource "aws_cloudwatch_metric_alarm" "blob_cpu_utilization" {
+  alarm_name          = "BlobCPUUtilizationAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  metric_name         = "CPUUtilization"
+  namespace           = "AWS/ECS"
+  period              = 60
+  statistic           = "Average"
+  threshold           = 90
+  alarm_description   = "Alarm when Blob service CPU utilization exceeds 90%"
+  dimensions = {
+    ClusterName = aws_ecs_cluster.comm_services.name
+    ServiceName = aws_ecs_service.blob_service.name
+  }
+  alarm_actions = [aws_sns_topic.blob_error_topic.arn]
+}
diff --git a/services/terraform/remote/alarms_identity.tf b/services/terraform/remote/alarms_identity.tf
new file mode 100644
index 0000000000..f1dc06d9a3
--- /dev/null
+++ b/services/terraform/remote/alarms_identity.tf
@@ -0,0 +1,56 @@
+# Identity service alarms: one log metric filter and one alarm per error type.
+
+locals {
+  identity_error_patterns = {
+    Search       = { name = "Search", pattern = "Search Error" },
+    Sync         = { name = "Sync", pattern = "Sync Error" },
+    Database     = { name = "DB", pattern = "*DB Error" },
+    GrpcServices = { name = "GrpcServices", pattern = "gRPC Services Error" },
+    Siwe         = { name = "Siwe", pattern = "SIWE Error" },
+    Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" }
+    Http         = { name = "HTTP", pattern = "HTTP Error" }
+  }
+
+  identity_error_threshold = 1
+}
+
+resource "aws_sns_topic" "identity_error_topic" {
+  name = "identity-error-topic"
+}
+
+resource "aws_sns_topic_subscription" "identity_email_subscription" {
+  topic_arn = aws_sns_topic.identity_error_topic.arn
+  protocol  = "email"
+  endpoint  = local.error_reports_subscribed_email
+}
+
+resource "aws_cloudwatch_log_metric_filter" "identity_error_filters" {
+  for_each = local.identity_error_patterns
+
+  name           = "Identity${each.value.name}ErrorCount"
+  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
+  log_group_name = "/ecs/identity-service-task-def"
+
+  metric_transformation {
+    name      = "Identity${each.value.name}ErrorCount"
+    namespace = "IdentityServiceMetricFilters"
+    value     = "1"
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "identity_error_alarms" {
+  for_each = local.identity_error_patterns
+
+  alarm_name          = "Identity${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "Identity${each.value.name}ErrorCount"
+  namespace           = "IdentityServiceMetricFilters"
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = local.identity_error_threshold
+  alarm_description   = "Alarm when Identity ${each.value.name} errors exceed threshold"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.identity_error_topic.arn]
+}
+
diff --git a/services/terraform/remote/alarms_lambda.tf b/services/terraform/remote/alarms_lambda.tf
new file mode 100644
index 0000000000..399a6d3b55
--- /dev/null
+++ b/services/terraform/remote/alarms_lambda.tf
@@ -0,0 +1,34 @@
+# Alarm on failures of the search index Lambda function.
+
+locals {
+  lambda_error_threshold = 2
+}
+
+resource "aws_sns_topic" "lambda_alarm_topic" {
+  name = "lambda-error-alarm-topic"
+}
+
+resource "aws_sns_topic_subscription" "email_subscription" {
+  topic_arn = aws_sns_topic.lambda_alarm_topic.arn
+  protocol  = "email"
+  endpoint  = local.error_reports_subscribed_email
+}
+
+resource "aws_cloudwatch_metric_alarm" "lambda_error_alarm" {
+  alarm_name          = "SearchIndexLambdaErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  # AWS/Lambda publishes function failures under the metric name "Errors".
+  # "LambdaErrors" is not a metric in that namespace, so an alarm on it gets no datapoints.
+  metric_name         = "Errors"
+  namespace           = "AWS/Lambda"
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = local.lambda_error_threshold
+  alarm_description   = "Alarm tracking search index lambda function failure"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.lambda_alarm_topic.arn]
+  dimensions = {
+    FunctionName = module.shared.search_index_lambda.function_name
+  }
+}
diff --git a/services/terraform/remote/alarms_tunnelbroker.tf b/services/terraform/remote/alarms_tunnelbroker.tf
new file mode 100644
index 0000000000..02b2aba43e
--- /dev/null
+++ b/services/terraform/remote/alarms_tunnelbroker.tf
@@ -0,0 +1,90 @@
+# Tunnelbroker alarms: log-based error counts plus ECS memory/CPU utilization.
+
+locals {
+  tunnelbroker_error_patterns = {
+    AMQP      = { name = "AMQP", pattern = "AMQP Error" },
+    DDB       = { name = "DDB", pattern = "DDB Error" },
+    FCM       = { name = "FCM", pattern = "FCM Error" },
+    APNs      = { name = "APNs", pattern = "APNs Error" },
+    WebPush   = { name = "WebPush", pattern = "Web Push Error" },
+    WNS       = { name = "WNS", pattern = "WNS Error" },
+    Identity  = { name = "Identity", pattern = "Identity Error" },
+    Websocket = { name = "Websocket", pattern = "Websocket Error" },
+    Server    = { name = "Server", pattern = "Server Error" },
+  }
+}
+
+resource "aws_sns_topic" "tunnelbroker_error_topic" {
+  name = "tunnelbroker-error-topic"
+}
+
+resource "aws_sns_topic_subscription" "tunnelbroker_email_subscription" {
+  topic_arn = aws_sns_topic.tunnelbroker_error_topic.arn
+  protocol  = "email"
+  endpoint  = local.error_reports_subscribed_email
+}
+
+resource "aws_cloudwatch_log_metric_filter" "tunnelbroker_error_filters" {
+  for_each = local.tunnelbroker_error_patterns
+
+  name           = "Tunnelbroker${each.value.name}ErrorCount"
+  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
+  log_group_name = "/ecs/tunnelbroker-task-def"
+
+  metric_transformation {
+    name      = "Tunnelbroker${each.value.name}ErrorCount"
+    namespace = "TunnelbrokerServiceMetricFilters"
+    value     = "1"
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "tunnelbroker_error_alarms" {
+  for_each = local.tunnelbroker_error_patterns
+
+  alarm_name          = "Tunnelbroker${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = "1"
+  metric_name         = "Tunnelbroker${each.value.name}ErrorCount"
+  namespace           = "TunnelbrokerServiceMetricFilters"
+  period              = "300"
+  statistic           = "Sum"
+  threshold           = 1
+  alarm_description   = "Alarm when Tunnelbroker ${each.value.name} errors exceed threshold"
+  actions_enabled     = true
+  alarm_actions       = [aws_sns_topic.tunnelbroker_error_topic.arn]
+}
+
+resource "aws_cloudwatch_metric_alarm" "tunnelbroker_memory_utilization" {
+  alarm_name          = "TunnelbrokerMemoryUtilizationAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  metric_name         = "MemoryUtilization"
+  period              = 60
+  statistic           = "Average"
+  threshold           = 90
+  alarm_description   = "Alarm when Tunnelbroker service memory utilization exceeds 90%"
+  alarm_actions       = [aws_sns_topic.tunnelbroker_error_topic.arn]
+  namespace           = "AWS/ECS"
+  dimensions = {
+    ClusterName = aws_ecs_cluster.comm_services.name
+    ServiceName = aws_ecs_service.tunnelbroker.name
+  }
+}
+
+
+resource "aws_cloudwatch_metric_alarm" "tunnelbroker_cpu_utilization" {
+  alarm_name          = "TunnelbrokerCPUUtilizationAlarm"
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  metric_name         = "CPUUtilization"
+  period              = 60
+  statistic           = "Average"
+  threshold           = 90
+  alarm_description   = "Alarm when Tunnelbroker service CPU utilization exceeds 90%"
+  alarm_actions       = [aws_sns_topic.tunnelbroker_error_topic.arn]
+  namespace           = "AWS/ECS"
+  dimensions = {
+    ClusterName = aws_ecs_cluster.comm_services.name
+    ServiceName = aws_ecs_service.tunnelbroker.name
+  }
+}
diff --git a/services/terraform/remote/aws_cloudwatch_alarms.tf b/services/terraform/remote/aws_cloudwatch_alarms.tf
index 30932e3c4d..d1c48b25ec 100644
--- a/services/terraform/remote/aws_cloudwatch_alarms.tf
+++ b/services/terraform/remote/aws_cloudwatch_alarms.tf
@@ -1,38 +1,6 @@
 locals {
   error_reports_subscribed_email = "error-reports@comm.app"
 
-  lambda_error_threshold   = "2"
-  identity_error_threshold = "1"
-
-  identity_error_patterns = {
-    Search       = { name = "Search", pattern = "Search Error" },
-    Sync         = { name = "Sync", pattern = "Sync Error" },
-    Database     = { name = "DB", pattern = "*DB Error" },
-    GrpcServices = { name = "GrpcServices", pattern = "gRPC Services Error" },
-    Siwe         = { name = "Siwe", pattern = "SIWE Error" },
-    Tunnelbroker = { name = "Tunnelbroker", pattern = "Tunnelbroker Error" }
-    Http         = { name = "HTTP", pattern = "HTTP Error" }
-  }
-
-  blob_error_patterns = {
-    S3    = { name = "S3", pattern = "S3 Error" },
-    DDB   = { name = "DDB", pattern = "DDB Error" },
-    HTTP  = { name = "HTTP", pattern = "HTTP Error" },
-    Other = { name = "Other", pattern = "Other Error" },
-  }
-
-  tunnelbroker_error_patterns = {
-    AMQP      = { name = "AMQP", pattern = "AMQP Error" },
-    DDB       = { name = "DDB", pattern = "DDB Error" },
-    FCM       = { name = "FCM", pattern = "FCM Error" },
-    APNs      = { name = "APNs", pattern = "APNs Error" },
-    WebPush   = { name = "WebPush", pattern = "Web Push Error" },
-    WNS       = { name = "WNS", pattern = "WNS Error" },
-    Identity  = { name = "Identity", pattern = "Identity Error" },
-    Websocket = { name = "Websocket", pattern = "Websocket Error" },
-    Server    = { name = "Server", pattern = "Server Error" },
-  }
-
   service_log_groups = {
     Backup = { name = "Backup", log_group_name = "/ecs/backup-service-task-def" },
     Blob = { name = "Blob", log_group_name = "/ecs/blob-service-task-def" },
@@ -44,79 +12,12 @@ locals {
   }
 }
 
-resource "aws_sns_topic" "lambda_alarm_topic" {
-  name = "lambda-error-alarm-topic"
-}
-
-resource "aws_sns_topic_subscription" "email_subscription" {
-  topic_arn = aws_sns_topic.lambda_alarm_topic.arn
-  protocol  = "email"
-  endpoint  = local.error_reports_subscribed_email
-}
-
-resource "aws_cloudwatch_metric_alarm" "lambda_error_alarm" {
-  alarm_name          = "SearchIndexLambdaErrorAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = "1"
-  metric_name         = "LambdaErrors"
-  namespace           = "AWS/Lambda"
-  period              = "300"
-  statistic           = "Sum"
-  threshold           = local.lambda_error_threshold
-  alarm_description   = "Alarm tracking search index lambda function failure"
-  actions_enabled     = true
-  alarm_actions       = [aws_sns_topic.lambda_alarm_topic.arn]
-  dimensions = {
-    FunctionName = module.shared.search_index_lambda.function_name
-  }
-}
-
-resource "aws_sns_topic" "identity_error_topic" {
-  name = "identity-error-topic"
-}
-
-resource "aws_sns_topic_subscription" "identity_email_subscription" {
-  topic_arn = aws_sns_topic.identity_error_topic.arn
-  protocol  = "email"
-  endpoint  = local.error_reports_subscribed_email
-}
-
 resource "aws_sns_topic_subscription" "ecs_task_stop_subscription" {
   topic_arn = aws_sns_topic.ecs_task_stop_topic.arn
   protocol  = "email"
   endpoint  = local.error_reports_subscribed_email
 }
 
-resource "aws_cloudwatch_log_metric_filter" "identity_error_filters" {
-  for_each = local.identity_error_patterns
-
-  name           = "Identity${each.value.name}ErrorCount"
-  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
-  log_group_name = "/ecs/identity-service-task-def"
-
-  metric_transformation {
-    name      = "Identity${each.value.name}ErrorCount"
-    namespace = "IdentityServiceMetricFilters"
-    value     = "1"
-  }
-}
-
-resource "aws_cloudwatch_metric_alarm" "identity_error_alarms" {
-  for_each = local.identity_error_patterns
-
-  alarm_name          = "Identity${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = "1"
-  metric_name         = "Identity${each.value.name}ErrorCount"
-  namespace           = "IdentityServiceMetricFilters"
-  period              = "300"
-  statistic           = "Sum"
-  threshold           = local.identity_error_threshold
-  alarm_description   = "Alarm when Identity ${each.value.name} errors exceed threshold"
-  actions_enabled     = true
-  alarm_actions       = [aws_sns_topic.identity_error_topic.arn]
-}
-
 resource "aws_sns_topic" "ecs_task_stop_topic" {
   name = "ecs-task-stop-topic"
 }
@@ -217,154 +118,3 @@ resource "aws_cloudwatch_metric_alarm" "service_connection_error_alarms" {
   actions_enabled     = true
   alarm_actions       = [aws_sns_topic.service_connection_error_topic.arn]
 }
-
-
-resource "aws_sns_topic" "blob_error_topic" {
-  name = "blob-error-topic"
-}
-
-resource "aws_sns_topic_subscription" "blob_email_subscription" {
-  topic_arn = aws_sns_topic.blob_error_topic.arn
-  protocol  = "email"
-  endpoint  = local.error_reports_subscribed_email
-}
-
-resource "aws_cloudwatch_log_metric_filter" "blob_error_filters" {
-  for_each = local.blob_error_patterns
-
-  name           = "Blob${each.value.name}ErrorCount"
-  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
-  log_group_name = "/ecs/blob-service-task-def"
-
-  metric_transformation {
-    name      = "Blob${each.value.name}ErrorCount"
-    namespace = "BlobServiceMetricFilters"
-    value     = "1"
-  }
-}
-
-resource "aws_cloudwatch_metric_alarm" "blob_error_alarms" {
-  for_each = local.blob_error_patterns
-
-  alarm_name          = "Blob${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = "1"
-  metric_name         = "Blob${each.value.name}ErrorCount"
-  namespace           = "BlobServiceMetricFilters"
-  period              = "300"
-  statistic           = "Sum"
-  threshold           = 1
-  alarm_description   = "Alarm when Blob ${each.value.name} errors exceed threshold"
-  actions_enabled     = true
-  alarm_actions       = [aws_sns_topic.blob_error_topic.arn]
-}
-
-resource "aws_cloudwatch_metric_alarm" "blob_memory_utilization" {
-  alarm_name          = "BlobMemoryUtilizationAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = 1
-  metric_name         = "MemoryUtilization"
-  namespace           = "AWS/ECS"
-  period              = 60
-  statistic           = "Average"
-  threshold           = 90
-  alarm_description   = "Alarm when Blob service memory utilization exceeds 90%"
-  dimensions = {
-    ClusterName = aws_ecs_cluster.comm_services.name
-    ServiceName = aws_ecs_service.blob_service.name
-  }
-  alarm_actions = [aws_sns_topic.blob_error_topic.arn]
-}
-
-
-resource "aws_cloudwatch_metric_alarm" "blob_cpu_utilization" {
-  alarm_name          = "BlobCPUUtilizationAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = 1
-  metric_name         = "CPUUtilization"
-  namespace           = "AWS/ECS"
-  period              = 60
-  statistic           = "Average"
-  threshold           = 90
-  alarm_description   = "Alarm when Blob service CPU utilization exceeds 90%"
-  dimensions = {
-    ClusterName = aws_ecs_cluster.comm_services.name
-    ServiceName = aws_ecs_service.blob_service.name
-  }
-  alarm_actions = [aws_sns_topic.blob_error_topic.arn]
-}
-
-resource "aws_sns_topic" "tunnelbroker_error_topic" {
-  name = "tunnelbroker-error-topic"
-}
-
-resource "aws_sns_topic_subscription" "tunnelbroker_email_subscription" {
-  topic_arn = aws_sns_topic.tunnelbroker_error_topic.arn
-  protocol  = "email"
-  endpoint  = local.error_reports_subscribed_email
-}
-
-resource "aws_cloudwatch_log_metric_filter" "tunnelbroker_error_filters" {
-  for_each = local.tunnelbroker_error_patterns
-
-  name           = "Tunnelbroker${each.value.name}ErrorCount"
-  pattern        = "{ $.level = \"ERROR\" && $.fields.errorType = \"${each.value.pattern}\" }"
-  log_group_name = "/ecs/tunnelbroker-task-def"
-
-  metric_transformation {
-    name      = "Tunnelbroker${each.value.name}ErrorCount"
-    namespace = "TunnelbrokerServiceMetricFilters"
-    value     = "1"
-  }
-}
-
-resource "aws_cloudwatch_metric_alarm" "tunnelbroker_error_alarms" {
-  for_each = local.tunnelbroker_error_patterns
-
-  alarm_name          = "Tunnelbroker${local.is_staging ? "Staging" : "Production"}${each.value.name}ErrorAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = "1"
-  metric_name         = "Tunnelbroker${each.value.name}ErrorCount"
-  namespace           = "TunnelbrokerServiceMetricFilters"
-  period              = "300"
-  statistic           = "Sum"
-  threshold           = 1
-  alarm_description   = "Alarm when Tunnelbroker ${each.value.name} errors exceed threshold"
-  actions_enabled     = true
-  alarm_actions       = [aws_sns_topic.tunnelbroker_error_topic.arn]
-}
-
-resource "aws_cloudwatch_metric_alarm" "tunnelbroker_memory_utilization" {
-  alarm_name          = "TunnelbrokerMemoryUtilizationAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = 1
-  metric_name         = "MemoryUtilization"
-  period              = 60
-  statistic           = "Average"
-  threshold           = 90
-  alarm_description   = "Alarm when Tunnelbroker service memory utilization exceeds 90%"
-  alarm_actions       = [aws_sns_topic.tunnelbroker_error_topic.arn]
-  namespace           = "AWS/ECS"
-  dimensions = {
-    ClusterName = aws_ecs_cluster.comm_services.name
-    ServiceName = aws_ecs_service.tunnelbroker.name
-  }
-}
-
-
-resource "aws_cloudwatch_metric_alarm" "tunnelbroker_cpu_utilization" {
-  alarm_name          = "TunnelbrokerCPUUtilizationAlarm"
-  comparison_operator = "GreaterThanOrEqualToThreshold"
-  evaluation_periods  = 1
-  metric_name         = "CPUUtilization"
-  period              = 60
-  statistic           = "Average"
-  threshold           = 90
-  alarm_description   = "Alarm when Tunnelbroker service CPU utilization exceeds 90%"
-  alarm_actions       = [aws_sns_topic.tunnelbroker_error_topic.arn]
-  namespace           = "AWS/ECS"
-  dimensions = {
-    ClusterName = aws_ecs_cluster.comm_services.name
-    ServiceName = aws_ecs_service.tunnelbroker.name
-  }
-}