GoogleCloudPlatform · Bslabe123 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024
@@ -19,13 +19,17 @@ export IP=$IP
 
 huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential
 
+# Map the dataset name to its local file; abort early if no file was resolved.
+[[ "$PROMPT_DATASET" = "sharegpt" ]] && PROMPT_DATASET_FILE="ShareGPT_V3_unfiltered_cleaned_split.json"
+: "${PROMPT_DATASET_FILE:?no dataset file for PROMPT_DATASET='${PROMPT_DATASET:-}' (supported: sharegpt)}"
+
 PYTHON="python3"
 PYTHON_OPTS="benchmark_serving.py "
 for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP   --port=$PORT   --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * $BENCHMARK_TIME_SECONDS)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
+  PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP   --port=$PORT   --model=$TOKENIZER --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * $BENCHMARK_TIME_SECONDS)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
   if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then
     PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics"
   fi

@@ -68,6 +68,7 @@ module "latency-profile" {
       port = var.targets.manual.service_port
     }
   }
+  prompt_dataset                             = var.prompt_dataset
   max_num_prompts                            = var.max_num_prompts
   max_output_len                             = var.max_output_len
   max_prompt_len                             = var.max_prompt_len

@@ -54,6 +54,7 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     inference_server_service_port              = var.inference_server.service.port
     tokenizer                                  = var.inference_server.tokenizer
     latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
+    prompt_dataset                             = var.prompt_dataset
     max_num_prompts                            = var.max_num_prompts
     max_output_len                             = var.max_output_len
     max_prompt_len                             = var.max_prompt_len

@@ -28,6 +28,8 @@ spec:
               value: ${inference_server_service_port}
             - name: BACKEND
               value: ${inference_server_framework}
+            - name: PROMPT_DATASET
+              value: ${prompt_dataset}
             - name: INPUT_LENGTH
               value: ${max_prompt_len}
             - name: OUTPUT_LENGTH

@@ -77,6 +77,17 @@ variable "inference_server" {
   }
 }
 
+variable "prompt_dataset" {
+  description = "Name of the prompt dataset used for benchmarking. Supported values: 'sharegpt'."
+  type        = string
+  nullable    = false
+  default     = "sharegpt"
+  validation {
+    condition     = contains(["sharegpt"], var.prompt_dataset)
+    error_message = "prompt_dataset must be one of the following: 'sharegpt'"
+  }
+}
+
 variable "max_num_prompts" {
   description = "Benchmark server configuration for max number of prompts."
   type        = number

@@ -64,6 +64,17 @@ variable "build_latency_profile_generator_image" {
   default     = true
 }
 
+variable "prompt_dataset" {
+  description = "Name of the prompt dataset used for benchmarking. Supported values: 'sharegpt'."
+  type        = string
+  nullable    = false
+  default     = "sharegpt"
+  validation {
+    condition     = contains(["sharegpt"], var.prompt_dataset)
+    error_message = "prompt_dataset must be one of the following: 'sharegpt'"
+  }
+}
+
 variable "max_num_prompts" {
   description = "Benchmark server configuration for max number of prompts."
   type        = number