Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make the benchmark time configurable and set default to 2 minutes #833

Merged
merged 2 commits on Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
# TODO: Check if profile already exists, if so then skip
timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
output_file="latency-profile-${timestamp}.txt"
PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * $BENCHMARK_TIME_SECONDS)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then
PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics"
fi
Expand Down
1 change: 1 addition & 0 deletions benchmarks/benchmark/tools/profile-generator/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ module "latency-profile" {
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
request_rates = var.request_rates
benchmark_time_seconds = var.benchmark_time_seconds
output_bucket = var.output_bucket
latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
k8s_hf_secret = var.k8s_hf_secret
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ resource "kubernetes_manifest" "latency-profile-generator" {
max_num_prompts = var.max_num_prompts
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
benchmark_time_seconds = var.benchmark_time_seconds
request_rates = join(",", [for number in var.request_rates : tostring(number)])
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ spec:
value: ${max_output_len}
- name: REQUEST_RATES
value: ${request_rates}
- name: BENCHMARK_TIME_SECONDS
value: ${benchmark_time_seconds}
- name: OUTPUT_BUCKET
value: ${output_bucket}
- name: SCRAPE_SERVER_METRICS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,6 @@ inference_server = {
}
}

# Benchmark configuration for Locust Docker accessing inference server
request_rates = [5, 10, 15, 20]
# Benchmark configuration for Latency Profile Generator container accessing inference server
request_rates = [5, 10, 15, 20]
benchmark_time_seconds = 120
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,11 @@ variable "scrape_server_metrics" {
description = "Whether to scrape server metrics."
type = bool
default = false
}

variable "benchmark_time_seconds" {
# Exposed to the benchmark container as BENCHMARK_TIME_SECONDS; the run
# script sizes the workload as request_rate * benchmark_time_seconds
# prompts, so this controls how long each request-rate stage runs.
description = "The amount of time (in seconds) the benchmark should be run at each request rate"
type = number
# 120s (2 minutes) per request rate by default.
default = 120
# Must always resolve to a concrete number — null is rejected so the
# derived prompt count in the benchmark script is never undefined.
nullable = false
}
7 changes: 4 additions & 3 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ latency_profile_kubernetes_service_account = "prom-frontend-sa"
output_bucket = "your_project_id-benchmark-output-bucket"
k8s_hf_secret = "hf-token"

# Benchmark configuration for Locust Docker accessing inference server
request_rates = [5, 10, 15, 20]
artifact_registry = "your_project_artifact_registry"
# Benchmark configuration for Latency Profile Generator accessing inference server
request_rates = [5, 10, 15, 20]
benchmark_time_seconds = 120
artifact_registry = "your_project_artifact_registry"

# Model server configuration information
targets = {
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/benchmark/tools/profile-generator/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,11 @@ variable "scrape_server_metrics" {
description = "Whether to scrape server metrics."
type = bool
default = false
}

variable "benchmark_time_seconds" {
# Top-level mirror of the latency-profile module's variable of the same
# name; it is passed through to the module, which sets the container's
# BENCHMARK_TIME_SECONDS env var (workload = request_rate * this value).
description = "The amount of time (in seconds) the benchmark should be run at each request rate"
type = number
# Default matches the module default: 2 minutes per request rate.
default = 120
# Reject explicit null so the pass-through to the module is always set.
nullable = false
}