Skip to content

Commit

Permalink
add gemma 2 27b
Browse files Browse the repository at this point in the history
  • Loading branch information
aidanrussell committed Dec 20, 2024
1 parent affe076 commit e08b7b5
Showing 1 changed file with 190 additions and 0 deletions.
190 changes: 190 additions & 0 deletions infra/sagemaker_llm_resources.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ locals {
module.gpt_neo_125_deployment.endpoint_name,
module.llama_3_2_1b_deployment.endpoint_name,
module.mistral_7b_deployment.endpoint_name,
module.gemma_2_27b_deployment.endpoint_name,
]
}

Expand Down Expand Up @@ -573,3 +574,192 @@ module "mistral_7b_deployment" {
]
slack_lambda_name = "slack-integration-${module.mistral_7b_deployment.endpoint_name}"
}


###############
# Gemma 2 27B
###############
# Instantiates the shared ./modules/sagemaker_deployment module for the
# Gemma 2 27B model: container image and model artefact location, container
# environment, instance type and networking, capacity bounds (0 to 2), and the
# list of alarm definitions passed to the module.
#
# Several values below reference this module's own outputs
# (module.gemma_2_27b_deployment.endpoint_name and the *_arn scaling-policy
# outputs) so that alarm/topic/log names follow the endpoint name.
module "gemma_2_27b_deployment" {
  source                 = "./modules/sagemaker_deployment"
  model_name             = "gemma-2-27b"
  sns_success_topic_arn  = module.sagemaker_output_mover.sns_success_topic_arn
  execution_role_arn     = module.iam.inference_role
  container_image        = "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.4.0-tgi2.3.1-gpu-py311-cu124-ubuntu22.04"
  uncompressed_model_uri = "s3://jumpstart-private-cache-prod-eu-west-2/huggingface-llm/huggingface-llm-gemma-2-27b/artifacts/inference-prepack/v1.0.0/"

  # Passed to the inference container. All values are strings.
  # NOTE(review): SM_NUM_GPUS presumably has to match the GPU count of
  # instance_type below - confirm when changing either value.
  environment_variables = {
    "ENDPOINT_SERVER_TIMEOUT"  = "3600"
    "HF_MODEL_ID"              = "/opt/ml/model"
    "MAX_BATCH_PREFILL_TOKENS" = "8191"
    "MAX_INPUT_LENGTH"         = "8191"
    "MAX_TOTAL_TOKENS"         = "8192"
    "MODEL_CACHE_ROOT"         = "/opt/ml/model"
    "SAGEMAKER_PROGRAM"        = "inference.py"
    "SM_NUM_GPUS"              = "8"
  }

  instance_type             = "ml.g5.48xlarge"
  security_group_ids        = [aws_security_group.notebooks.id]
  subnets                   = aws_subnet.private_without_egress[*].id
  endpoint_config_name      = "sagemaker-endpoint-config-gemma-2-27b"
  endpoint_name             = "gemma-2-27b-endpoint"
  variant_name              = "gemma-2-27b-endpoint-dev"
  initial_instance_count    = 1
  s3_output_path            = "https://${module.iam.default_sagemaker_bucket.bucket_regional_domain_name}"
  max_capacity              = 2
  min_capacity              = 0
  scale_up_adjustment       = 1
  scale_up_cooldown         = 30
  scale_in_to_zero_cooldown = 120
  log_group_name            = "/aws/sagemaker/Endpoints/${module.gemma_2_27b_deployment.endpoint_name}"
  aws_account_id            = data.aws_caller_identity.aws_caller_identity.account_id

  # Alarm definitions. Entries without alarm_actions only carry an SNS topic
  # name and Slack webhook; entries with alarm_actions additionally reference
  # one of this module's scaling-policy outputs (or a shared SNS topic).
  alarms = [
    {
      alarm_name          = "backlog-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale up to 1 when queries are in the backlog, if 0 instances"
      metric_name         = "HasBacklogWithoutCapacity"
      namespace           = "AWS/SageMaker"
      comparison_operator = "GreaterThanOrEqualToThreshold"
      threshold           = 1
      evaluation_periods  = 1
      datapoints_to_alarm = 1
      period              = 30
      statistic           = "Average"
      alarm_actions       = [module.gemma_2_27b_deployment.scale_up_policy_arn]
      sns_topic_name      = "backlog-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_backlog_alerts
    },
    {
      alarm_name          = "low-cpu-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale in to zero when CPU < 5%"
      metric_name         = "CPUUtilization"
      namespace           = "/aws/sagemaker/Endpoints"
      comparison_operator = "LessThanThreshold"
      threshold           = 5.0
      evaluation_periods  = 3
      datapoints_to_alarm = 2
      period              = 60
      statistic           = "Average"
      alarm_actions       = [module.gemma_2_27b_deployment.scale_in_to_zero_policy_arn]
      sns_topic_name      = "low-cpu-alert-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_cpu_alerts
    },
    # The backlog size is a count and cannot be negative, so the comparison
    # has to include 0 ("<= 0"); a strict "< 0" could never be breached.
    {
      alarm_name          = "no-query-in-backlog-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale in to zero when no queries are in the backlog for > 3 minutes"
      metric_name         = "ApproximateBacklogSize"
      namespace           = "AWS/SageMaker"
      comparison_operator = "LessThanOrEqualToThreshold"
      threshold           = 0
      evaluation_periods  = 3
      datapoints_to_alarm = 2
      period              = 60
      statistic           = "Sum"
      alarm_actions       = [module.gemma_2_27b_deployment.scale_in_to_zero_based_on_backlog_arn]
      sns_topic_name      = "no-query-in-backlog-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_backlog_alerts
    },
    {
      alarm_name          = "high-cpu-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale out when CPU is at 70% threshold"
      metric_name         = "CPUUtilization"
      namespace           = "/aws/sagemaker/Endpoints"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 70
      evaluation_periods  = 1
      datapoints_to_alarm = 1
      period              = 60
      statistic           = "Average"
      alarm_actions       = [module.gemma_2_27b_deployment.scale_up_policy_arn]
      sns_topic_name      = "high-cpu-alert-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_cpu_alerts
    },
    {
      alarm_name          = "high-memory-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale up memory usage > 80%"
      metric_name         = "MemoryUtilization"
      namespace           = "/aws/sagemaker/Endpoints"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 80
      evaluation_periods  = 2
      datapoints_to_alarm = 1
      period              = 60
      statistic           = "Average"
      alarm_actions       = [module.gemma_2_27b_deployment.scale_up_policy_arn]
      sns_topic_name      = "high-memory-alert-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_resource_alerts
    },
    {
      alarm_name          = "high-GPU-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale up GPU usage > 70%"
      metric_name         = "GPUUtilization"
      namespace           = "/aws/sagemaker/Endpoints"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 70
      evaluation_periods  = 2
      datapoints_to_alarm = 1
      period              = 60
      statistic           = "Average"
      alarm_actions       = [module.gemma_2_27b_deployment.scale_up_policy_arn]
      sns_topic_name      = "high-gpu-alert-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_gpu_alerts
    },
    # Notification only: no alarm_actions, so no scaling policy is attached.
    {
      alarm_name          = "network-spike-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scale up to 1 (deactivated) when endpoint experiences a backlog of requests beyond threshold"
      metric_name         = "ApproximateBacklogSize"
      namespace           = "AWS/SageMaker"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 10
      evaluation_periods  = 2
      datapoints_to_alarm = 2
      period              = 30
      statistic           = "Average"
      sns_topic_name      = "network-spike-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_resource_alerts
    },
    # Notification only: no alarm_actions, so no scaling policy is attached.
    {
      alarm_name          = "disk-util-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Alerts when disk util is high"
      metric_name         = "DiskUtilization"
      namespace           = "/aws/sagemaker/Endpoints"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 80
      evaluation_periods  = 2
      datapoints_to_alarm = 2
      period              = 30
      statistic           = "Average"
      sns_topic_name      = "disk-util-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_resource_alerts
    },
    # Notification only: no alarm_actions, so no scaling policy is attached.
    # NOTE(review): the threshold evaluates to 2 and is compared against the
    # Sum of Invocation4XXErrors, i.e. an absolute error count per period, not
    # a percentage of invocations. The 200 looks like an assumed request
    # volume - confirm, or switch to a metric-math rate if a true 1% is wanted.
    {
      alarm_name          = "error-rate-high-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Scales up (deactivated) when Invocation Error rate exceeds 1% over 5 minutes"
      metric_name         = "Invocation4XXErrors"
      namespace           = "AWS/SageMaker"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 200 * 0.01
      evaluation_periods  = 1
      datapoints_to_alarm = 1
      period              = 300
      statistic           = "Sum"
      sns_topic_name      = "High-error-rate-operations-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_resource_alerts
    },
    # Routes to the shared unauthorised-access SNS topic rather than to one of
    # this module's scaling policies.
    {
      alarm_name          = "unauthorised-operations-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      alarm_description   = "Triggers when unauthorized operations are detected in the CloudTrail Logs"
      metric_name         = "UnauthorizedOperationsCount"
      namespace           = "CloudTrailMetrics"
      comparison_operator = "GreaterThanThreshold"
      threshold           = 1
      evaluation_periods  = 1
      datapoints_to_alarm = 1
      period              = 300
      statistic           = "Sum"
      alarm_actions       = [module.sns.unauthorised_access_sns_topic_arn]
      sns_topic_name      = "unauthorised-operations-alarm-${module.gemma_2_27b_deployment.endpoint_name}"
      slack_webhook_url   = var.slack_webhook_security_alerts
    },
  ]

  slack_lambda_name = "slack-integration-${module.gemma_2_27b_deployment.endpoint_name}"
}

0 comments on commit e08b7b5

Please sign in to comment.