Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make the benchmark time configurable and set default to 2 minutes #833

Merged
merged 2 commits on Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
# TODO: Check if profile already exists, if so then skip
timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
output_file="latency-profile-${timestamp}.txt"
PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * $BENCHMARK_TIME_SECONDS)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then
PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics"
fi
Expand Down
1 change: 1 addition & 0 deletions benchmarks/benchmark/tools/profile-generator/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ module "latency-profile" {
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
request_rates = var.request_rates
benchmark_time_seconds = var.benchmark_time_seconds
output_bucket = var.output_bucket
latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
k8s_hf_secret = var.k8s_hf_secret
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ resource "kubernetes_manifest" "latency-profile-generator" {
max_num_prompts = var.max_num_prompts
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
benchmark_time_seconds = var.benchmark_time_seconds
request_rates = join(",", [for number in var.request_rates : tostring(number)])
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ spec:
value: ${max_output_len}
- name: REQUEST_RATES
value: ${request_rates}
- name: BENCHMARK_TIME_SECONDS
value: ${benchmark_time_seconds}
- name: OUTPUT_BUCKET
value: ${output_bucket}
- name: SCRAPE_SERVER_METRICS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,6 @@ inference_server = {
}
}

# Benchmark configuration for Locust Docker accessing inference server
request_rates = [5, 10, 15, 20]
# Benchmark configuration for Latency Profile Generator container accessing inference server
request_rates = [5, 10, 15, 20]
benchmark_time_seconds = 120
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,11 @@ variable "scrape_server_metrics" {
description = "Whether to scrape server metrics."
type = bool
default = false
}

variable "benchmark_time_seconds" {
# Exposed to the benchmark container as BENCHMARK_TIME_SECONDS; the run
# script sizes the workload as request_rate * benchmark_time_seconds
# prompts, so this controls how long each request-rate stage runs.
description = "The amount of time (in seconds) the benchmark should be run at each request rate"
type = number
# 120s (2 minutes) per request rate by default.
default = 120
# Must always resolve to a concrete number — null is rejected so the
# derived prompt count in the benchmark script is never undefined.
nullable = false
}
7 changes: 4 additions & 3 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ latency_profile_kubernetes_service_account = "prom-frontend-sa"
output_bucket = "your_project_id-benchmark-output-bucket"
k8s_hf_secret = "hf-token"

# Benchmark configuration for Locust Docker accessing inference server
request_rates = [5, 10, 15, 20]
artifact_registry = "your_project_artifact_registry"
# Benchmark configuration for Latency Profile Generator accessing inference server
request_rates = [5, 10, 15, 20]
benchmark_time_seconds = 120
artifact_registry = "your_project_artifact_registry"

# Model server configuration information
targets = {
Expand Down
7 changes: 7 additions & 0 deletions benchmarks/benchmark/tools/profile-generator/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,11 @@ variable "scrape_server_metrics" {
description = "Whether to scrape server metrics."
type = bool
default = false
}

variable "benchmark_time_seconds" {
# Top-level mirror of the latency-profile module's variable of the same
# name; it is passed through to the module, which sets the container's
# BENCHMARK_TIME_SECONDS env var (workload = request_rate * this value).
description = "The amount of time (in seconds) the benchmark should be run at each request rate"
type = number
# Default matches the module default: 2 minutes per request rate.
default = 120
# Reject explicit null so the pass-through to the module is always set.
nullable = false
}