diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py index fb18fa6cc..617c38b0e 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py +++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py @@ -17,6 +17,7 @@ import google.auth import google.auth.transport.requests +from google.cloud import storage import aiohttp import numpy as np @@ -47,6 +48,10 @@ async def on_request_end(session, trace_config_ctx, params): trace_config.on_request_start.append(on_request_start) trace_config.on_request_end.append(on_request_end) +# Google Cloud Storage Client +gcs_client = None +gcs_bucket = None + def sample_requests( dataset_path: str, num_requests: int, @@ -337,7 +342,6 @@ async def benchmark( print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors) return combined_latencies, combined_errors - def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics, model, errors): # Setup start_dt_proto = Timestamp() @@ -427,6 +431,9 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics ) with open(file_name, "w", encoding="utf-8") as outfile: json.dump(final_json, outfile) + if gcs_bucket is not None: + gcs_bucket.blob(f"{args.output_bucket_filepath}/{file_name}").upload_from_filename(file_name) + print(f"File {file_name} uploaded to gs://{args.output_bucket}/{args.output_bucket_filepath}") def metrics_to_scrape(backend: str) -> List[str]: # Each key in the map is a metric, it has a corresponding 'stats' object @@ -610,6 +617,19 @@ async def main(args: argparse.Namespace): if args.backend == "vllm" else args.endpoint ) + + # Create GCS client before benchmarking + # Should fail fast if client is misconfigured or missing permissions + if args.output_bucket is not None: + global gcs_client + gcs_client = 
storage.Client() + global gcs_bucket + gcs_bucket = gcs_client.bucket(args.output_bucket) + + if args.output_bucket_filepath: + blob = gcs_bucket.blob(args.output_bucket_filepath) + if not blob.exists(): + blob.upload_from_string('') print(f"Starting Prometheus Server on port {PROMETHEUS_PORT}") start_http_server(PROMETHEUS_PORT) @@ -759,6 +779,27 @@ async def main(args: argparse.Namespace): action="store_true", help="Whether to save benchmark results to a json file.", ) + parser.add_argument( + "--output-bucket", + type=str, + default=None, + help=( + "Specifies the Google Cloud Storage bucket to which JSON-format results" + " will be uploaded. If not provided, no upload will occur." + ) + ) + parser.add_argument( + "--output-bucket-filepath", + type=str, + default=None, + help=( + "Specifies the destination path within the bucket provided by" + " --output-bucket for uploading the JSON results. This argument requires" + " --output-bucket to be set. If not specified, results will be uploaded" + " to the root of the bucket. If the filepath doesn't exist, it will be" + " created for you." 
+ ) + ) parser.add_argument( "--save-aggregated-result", action="store_true", diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh index 7c8f638e3..f76de4c49 100755 --- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh +++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh @@ -37,7 +37,7 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do num_prompts=$(awk "BEGIN {print int($request_rate * $BENCHMARK_TIME_SECONDS)}") fi echo "TOTAL prompts: $num_prompts" # Output: 8 - PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$num_prompts --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --file-prefix=$FILE_PREFIX --models=$MODELS" + PYTHON_OPTS="$PYTHON_OPTS --save-json-results --output-bucket=$OUTPUT_BUCKET --output-bucket-filepath $OUTPUT_BUCKET_FILEPATH --host=$IP --port=$PORT --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$num_prompts --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --file-prefix=$FILE_PREFIX --models=$MODELS" if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics" fi diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt index a9f6d99a6..c3bfdaca3 100644 --- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt +++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt @@ -35,4 +35,5 @@ pynvml == 11.5.0 accelerate aiohttp google-auth +google-cloud-storage >= 2.18.2 prometheus_client >= 0.21.0 \ No newline at end of file diff 
--git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index 766004b62..637ce4c1c 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -68,13 +68,16 @@ module "latency-profile" { port = var.targets.manual.service_port } } - prompt_dataset = var.prompt_dataset - max_num_prompts = var.max_num_prompts - max_output_len = var.max_output_len - max_prompt_len = var.max_prompt_len - request_rates = var.request_rates - benchmark_time_seconds = var.benchmark_time_seconds - output_bucket = var.output_bucket + prompt_dataset = var.prompt_dataset + max_num_prompts = var.max_num_prompts + max_output_len = var.max_output_len + max_prompt_len = var.max_prompt_len + request_rates = var.request_rates + benchmark_time_seconds = var.benchmark_time_seconds + gcs_output = { + bucket = var.output_bucket + filepath = var.output_bucket_filepath + } latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account k8s_hf_secret = var.k8s_hf_secret hugging_face_secret = var.hugging_face_secret diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf index 9d6591394..e74e0bb3e 100644 --- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf @@ -63,7 +63,8 @@ resource "kubernetes_manifest" "latency-profile-generator" { request_rates = join(",", [for number in var.request_rates : tostring(number)]) hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] k8s_hf_secret_list = var.k8s_hf_secret == null ? 
[] : [var.k8s_hf_secret] - output_bucket = var.output_bucket + output_bucket = var.gcs_output.bucket + output_bucket_filepath = var.gcs_output.filepath scrape_server_metrics = var.scrape_server_metrics file_prefix = var.file_prefix save_aggregated_result = var.save_aggregated_result diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl index 60eacf4c7..0be39a529 100644 --- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl @@ -42,6 +42,8 @@ spec: value: ${benchmark_time_seconds} - name: OUTPUT_BUCKET value: ${output_bucket} + - name: OUTPUT_BUCKET_FILEPATH + value: ${output_bucket_filepath} - name: SCRAPE_SERVER_METRICS value: ${scrape_server_metrics} - name: MAX_NUM_PROMPTS diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf index 3489b63ad..1284c0b4a 100644 --- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf @@ -139,9 +139,13 @@ variable "models" { default = "tiiuae/falcon-7b" } -variable "output_bucket" { - description = "Bucket name for storing results" - type = string +variable "gcs_output" { + description = "Bucket name and filepath for storing json results, if filepath not specified, results uploaded to root of bucket" + type = object({ + bucket = string + filepath = optional(string) + }) + nullable = true } variable "latency_profile_kubernetes_service_account" { diff --git 
a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf index 1e7864bad..d51fb2ce7 100644 --- a/benchmarks/benchmark/tools/profile-generator/variables.tf +++ b/benchmarks/benchmark/tools/profile-generator/variables.tf @@ -117,6 +117,12 @@ variable "output_bucket" { type = string } +variable "output_bucket_filepath" { + description = "Where in bucket to store json results, will upload to root of bucket if not specified" + type = string + nullable = true +} + variable "latency_profile_kubernetes_service_account" { description = "Kubernetes Service Account to be used for the latency profile generator tool" type = string