diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
index f246bff8d..97ba646d3 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
+++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
@@ -19,13 +19,17 @@ export IP=$IP
 
 huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential
 
+if [[ "$PROMPT_DATASET" = "sharegpt" ]]; then
+  PROMPT_DATASET_FILE="ShareGPT_V3_unfiltered_cleaned_split.json"
+fi
+
 PYTHON="python3"
 PYTHON_OPTS="benchmark_serving.py "
 for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * $BENCHMARK_TIME_SECONDS)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
+  PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * $BENCHMARK_TIME_SECONDS)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
   if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then
     PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics"
   fi
diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf
index 3a1727b80..495cfae84 100644
--- a/benchmarks/benchmark/tools/profile-generator/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/main.tf
@@ -68,6 +68,7 @@ module "latency-profile" {
       port = var.targets.manual.service_port
     }
   }
+  prompt_dataset  = var.prompt_dataset
   max_num_prompts = var.max_num_prompts
   max_output_len  = var.max_output_len
   max_prompt_len  = var.max_prompt_len
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
index 335d1eadd..79b050e3e 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -54,6 +54,7 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     inference_server_service_port               = var.inference_server.service.port
     tokenizer                                   = var.inference_server.tokenizer
     latency_profile_kubernetes_service_account  = var.latency_profile_kubernetes_service_account
+    prompt_dataset                              = var.prompt_dataset
     max_num_prompts                             = var.max_num_prompts
     max_output_len                              = var.max_output_len
     max_prompt_len                              = var.max_prompt_len
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index e826a584f..8dafa56d1 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -28,6 +28,8 @@ spec:
           value: ${inference_server_service_port}
         - name: BACKEND
           value: ${inference_server_framework}
+        - name: PROMPT_DATASET
+          value: ${prompt_dataset}
        - name: INPUT_LENGTH
          value: ${max_prompt_len}
        - name: OUTPUT_LENGTH
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
index 2f77be5bf..6e7702e7b 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
@@ -77,6 +77,17 @@ variable "inference_server" {
   }
 }
 
+variable "prompt_dataset" {
+  description = "Prompt dataset to use for benchmarking. Currently only 'sharegpt' is supported."
+  type        = string
+  nullable    = false
+  default     = "sharegpt"
+  validation {
+    condition     = contains(["sharegpt"], var.prompt_dataset)
+    error_message = "prompt_dataset must be one of the following: 'sharegpt'"
+  }
+}
+
 variable "max_num_prompts" {
   description = "Benchmark server configuration for max number of prompts."
   type        = number
diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index 3d29046ec..a9ab4b7d4 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -64,6 +64,17 @@ variable "build_latency_profile_generator_image" {
   default     = true
 }
 
+variable "prompt_dataset" {
+  description = "Prompt dataset to use for benchmarking. Currently only 'sharegpt' is supported."
+  type        = string
+  nullable    = false
+  default     = "sharegpt"
+  validation {
+    condition     = contains(["sharegpt"], var.prompt_dataset)
+    error_message = "prompt_dataset must be one of the following: 'sharegpt'"
+  }
+}
+
 variable "max_num_prompts" {
   description = "Benchmark server configuration for max number of prompts."
   type        = number
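
A minimal usage sketch, assuming the new variable is set from the root module; the file name terraform.tfvars is illustrative, and terraform apply -var="prompt_dataset=sharegpt" works equally well. "sharegpt" is currently the only accepted value and resolves to ShareGPT_V3_unfiltered_cleaned_split.json inside the benchmark container.

# terraform.tfvars (hypothetical example)
prompt_dataset = "sharegpt"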