From 89af4bd420d34bfd0d94b7c43d0b76a6aa1b882b Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Wed, 25 Sep 2024 22:13:05 +0000
Subject: [PATCH 1/5] first commit

---
 .../container/benchmark_serving.py | 185 ++++++++++--------
 1 file changed, 107 insertions(+), 78 deletions(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index e208ac530..ab98cf694 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -285,7 +285,7 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
       "num_prompts": args.num_prompts,
       "request_rate": args.request_rate,
       'server_metrics': {
-        **server_metrics.get("metric_names", {})
+        **server_metrics
       },
       **benchmark_result
     },
@@ -303,9 +303,49 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
       "start_time": start_dt_proto.ToJsonString(),
     },
     "summary_stats": {
-      "stats": {
-        **server_metrics.get("metric_aliases", {})
-      }
+      "stats": [{
+        "request_rate": 0,
+        "request_latency": {
+          "mean": benchmark_result["avg_latency"],
+          "median": benchmark_result["median_latency"],
+          "sd": benchmark_result["sd_latency"],
+          "min": benchmark_result["min_latency"],
+          "max": benchmark_result["max_latency"],
+          "p90": benchmark_result["p90_latency"],
+          "p99": benchmark_result["p99_latency"],
+        },
+        "throughput": {
+          "mean": benchmark_result['throughput']
+        },
+        "input_length": {
+          "mean": benchmark_result["avg_input_len"],
+          "median": benchmark_result["median_input_len"],
+          "sd": benchmark_result["sd_input_len"],
+          "min": benchmark_result["min_input_len"],
+          "max": benchmark_result["max_input_len"],
+          "p90": benchmark_result["p90_input_len"],
+          "p99": benchmark_result["p99_input_len"],
+        },
+        "output_length": {
+          "mean": benchmark_result["avg_output_len"],
+          "median": benchmark_result["median_output_len"],
+          "sd": benchmark_result["sd_output_len"],
+          "min": benchmark_result["min_output_len"],
+          "max": benchmark_result["max_output_len"],
+          "p90": benchmark_result["p90_output_len"],
+          "p99": benchmark_result["p99_output_len"],
+        },
+        "tpot": {
+          "mean": benchmark_result["avg_per_output_token_latency"],
+          "median": benchmark_result["median_per_output_token_latency"],
+          "sd": benchmark_result["sd_per_output_token_latency"],
+          "min": benchmark_result["min_per_output_token_latency"],
+          "max": benchmark_result["max_per_output_token_latency"],
+          "p90": benchmark_result["p90_per_output_token_latency"],
+          "p99": benchmark_result["p99_per_output_token_latency"],
+        },
+        "model_server_metrics" : [{"Name": name, **metrics} for name, metrics in server_metrics.items()]
+      }]
     }
   }
@@ -317,27 +357,24 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
   with open(file_name, "w", encoding="utf-8") as outfile:
     json.dump(final_json, outfile)

-def metrics_to_scrape(backend: str) -> Dict[str, Optional[str]]:
-  # Each key in the map is a metric, it has a corresponding 'stats' object
-  # It must be populated on the outputs 'metrics' field as 'key':'stats'
-  # If a value is specified for a given key, it will be populated on the outputs `summary_stats.stats` field as 'value':'stats' as well.
-  if backend == "vllm":
-    return {
-      "vllm:gpu_cache_usage_perc": None,
-      "vllm:num_requests_waiting": None
-    }
-  elif backend == "jetstream":
-    return {
-      "jetstream_slots_used_percentage": None,
-      "jetstream_prefill_backlog_size": None,
-      "jetstream_time_to_first_token": "ttft",
-      "jetstream_time_per_output_token": "tpot",
-      "jetstream_time_per_request": "request_latency"
-    }
-  else:
-    return {}
-
-def print_metrics(metrics: Dict[str, Optional[str]], duration: float, backend: str):
+def metrics_to_scrape(backend: str) -> List[str]:
+  # Returns the list of model server metrics to scrape for the given backend.
+  # Each metric is queried from Cloud Monitoring over the benchmark duration and
+  # populated on the output's 'server_metrics' field as '<metric name>': <stats>.
+  if backend == "vllm":
+    return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"]
+  elif backend == "jetstream":
+    return [
+      "jetstream_slots_used_percentage",
+      "jetstream_prefill_backlog_size",
+      "jetstream_time_to_first_token",
+      "jetstream_time_per_output_token",
+      "jetstream_time_per_request",
+    ]
+  else:
+    return []
+
+def print_metrics(metrics: List[str], duration: float, backend: str):
   # Creates a credentials object from the default service account file
   # Assumes that script has appropriate default credentials set up, ref:
   # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
@@ -345,10 +382,7 @@ def print_metrics(metrics: Dict[str, Optional[str]], duration: float, backend: s
   # Prepare an authentication request - helps format the request auth token
   auth_req = google.auth.transport.requests.Request()

-  all_metric_results = {
-    "metric_names" : {},
-    "metric_aliases": {},
-  }
+  server_metrics = {}

   # Request refresh tokens
   credentials.refresh(auth_req)
@@ -361,7 +395,7 @@ def print_metrics(metrics: Dict[str, Optional[str]], duration: float, backend: s
   if all_metrics_metadata["status"] != "success":
     print("Metadata error response: %s" % all_metrics_metadata["error"])

-  for metric, metric_alias in metrics.items():
+  for metric in metrics:
     print("Metric Name: %s" % (metric))

     # Find metric type
@@ -378,7 +412,7 @@ def print_metrics(metrics: Dict[str, Optional[str]], duration: float, backend: s
     "gauge": {
       "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
       "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-      "Std": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
@@ -410,11 +444,35 @@ def print_metrics(metrics: Dict[str, Optional[str]], duration: float, backend: s
           print("Cloud Monitoring PromQL Error: %s" % (response["error"]))
       else:
         print("HTTP Error: %s" % (response))
-    all_metric_results["metric_names"][metric] = metric_results
-    if metric_alias is not None:
-      all_metric_results["metric_aliases"][metric_alias] = metric_results
-  return all_metric_results
-
+    server_metrics[metric] = metric_results
+  return server_metrics
+
+def get_stats_for_set(name, description, points):
+  avg = np.mean(points)
+  median = np.median(points)
+  sd = np.std(points)
+  min = np.min(points)
+  max = np.max(points)
+  p90 = np.percentile(points, 90)
+  p99 = np.percentile(points, 99)
+
+  print(f"Average {description}:" f" {avg:.2f}")
+  print(f"Median {description}:" f" {median:.2f}")
+  print(f"Standard deviation of {description}:" f" {sd:.2f}")
+  print(f"Minimum {description}:" f" {min:.2f}")
+  print(f"Maximum {description}:" f" {max:.2f}")
+  print(f"90th Percentile {description}:" f" {p90:.2f}")
+  print(f"99th Percentile {description}:" f" {p99:.2f}")
+
+  return {
+    f'avg_{name}': avg,
+    f'median_{name}': median,
+    f'sd_{name}': sd,
+    f'min_{name}': min,
+    f'max_{name}': max,
+    f'p90_{name}': p90,
+    f'p99_{name}': p99,
+  }

 def main(args: argparse.Namespace):
   print(args)
@@ -463,6 +521,7 @@ def main(args: argparse.Namespace):
   print(f"Total time: {benchmark_time:.2f} s")
   print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}")
   benchmark_result['benchmark_time'] = benchmark_time
+  benchmark_result['throughput'] = (args.num_prompts / benchmark_time)

   total_output_tokens = np.sum([output_len for _, output_len, _ in
                                 REQUEST_LATENCY])
@@ -489,51 +548,21 @@
       "Cost $/1k tokens:"
       f" {args.machine_cost * 1000 / (60 * output_tokens_per_min)}"
   )
-  # NOTE: The latency below includes requests awaiting time on server side.
-  # It's not comparable with the model inference latency for batch size 1.
-  avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
-  print(
-      "Average seconds/request (includes waiting time on server):"
-      f" {avg_latency:.2f}"
-  )
-  benchmark_result['avg_latency'] = avg_latency
-  avg_per_token_latency = np.mean([
+  benchmark_result = {
+    **benchmark_result,
+    **(get_stats_for_set("per_token_latency", "seconds/token (includes waiting time on server)", [
       latency / (prompt_len + output_len)
       for prompt_len, output_len, latency in REQUEST_LATENCY
-  ])
-  print(
-      "Average milliseconds/token (includes waiting time on server):"
-      f" {1000 * avg_per_token_latency:.2f}"
-  )
-  benchmark_result['avg_per_token_latency'] = avg_per_token_latency
-
-  avg_per_output_token_latency = np.mean(
-      [latency / output_len for _, output_len, latency in REQUEST_LATENCY]
-  )
-  print(
-      "Average milliseconds/output_token (includes waiting time on server):"
-      f" {1000 * avg_per_output_token_latency:.2f}"
-  )
-  benchmark_result['avg_per_output_token_latency'] = avg_per_output_token_latency
-
-  avg_input_len = np.mean(
-      [prompt_len for prompt_len, _, _ in REQUEST_LATENCY]
-  )
-  print(
-      "Average input length:"
-      f" {avg_input_len:.2f}"
-  )
-  benchmark_result['avg_input_len'] = avg_input_len
-
-  avg_output_len = np.mean(
-      [output_len for _, output_len, _ in REQUEST_LATENCY]
-  )
-  print(
-      "Average output length:"
-      f" {avg_output_len:.2f}"
-  )
-  benchmark_result['avg_output_len'] = avg_output_len
+    ])),
+
+    # NOTE: The latency below includes requests awaiting time on server side.
+    # It's not comparable with the model inference latency for batch size 1.
+    **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)", [1000 * latency for _, _, latency in REQUEST_LATENCY])),
+    **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)", [1000 * latency / output_len for _, output_len, latency in REQUEST_LATENCY])),
+    **(get_stats_for_set("input_len", "input length", [float(prompt_len) for prompt_len, _, _ in REQUEST_LATENCY])),
+    **(get_stats_for_set("output_len", "output length", [float(output_len) for _, output_len, _ in REQUEST_LATENCY]))
+  }

   server_metrics = {}
   if args.scrape_server_metrics:

From fd46e2f8907f187db2040a1efc27cbbfac1af44c Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Wed, 25 Sep 2024 22:15:05 +0000
Subject: [PATCH 2/5] remove start time

---
 .../tools/profile-generator/container/benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index ab98cf694..5828a3673 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -300,7 +300,7 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
     "config": {
       "model": args.model,
       "model_server": args.backend,
-      "start_time": start_dt_proto.ToJsonString(),
+      # "start_time": start_dt_proto.ToJsonString(), TODO: marshal this such that the unmarshaller doesn't error
     },
     "summary_stats": {
       "stats": [{

From 1e03e44ec3134ea59f16875395fb084f4e27f850 Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Wed, 25 Sep 2024 22:39:08 +0000
Subject: [PATCH 3/5] parse float

---
 .../tools/profile-generator/container/benchmark_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 5828a3673..374658545 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -438,7 +438,7 @@ def print_metrics(metrics: List[str], duration: float, backend: str):
       # handle response
       if request_post.ok:
         if response["status"] == "success":
-          metric_results[query_name] = response["data"]["result"][0]["value"][1]
+          metric_results[query_name] = float(response["data"]["result"][0]["value"][1])
           print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
         else:
           print("Cloud Monitoring PromQL Error: %s" % (response["error"]))

From b8a511ba1e78354d9f96e3a561468f5f69ea9a00 Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Thu, 26 Sep 2024 17:00:44 +0000
Subject: [PATCH 4/5] Only print average to .txt output

---
 .../tools/profile-generator/container/benchmark_serving.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 374658545..def475535 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -457,12 +457,6 @@ def get_stats_for_set(name, description, points):
   p99 = np.percentile(points, 99)

   print(f"Average {description}:" f" {avg:.2f}")
-  print(f"Median {description}:" f" {median:.2f}")
-  print(f"Standard deviation of {description}:" f" {sd:.2f}")
-  print(f"Minimum {description}:" f" {min:.2f}")
-  print(f"Maximum {description}:" f" {max:.2f}")
-  print(f"90th Percentile {description}:" f" {p90:.2f}")
-  print(f"99th Percentile {description}:" f" {p99:.2f}")

   return {
     f'avg_{name}': avg,

From f9f76c586ee76e4509c5f09c2d68d16ccd8150ed Mon Sep 17 00:00:00 2001
From: Brendan Slabe
Date: Thu, 26 Sep 2024 18:31:35 +0000
Subject: [PATCH 5/5] fix start_time

---
 .../tools/profile-generator/container/benchmark_serving.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index def475535..ca82d07cc 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -300,7 +300,10 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
     "config": {
       "model": args.model,
       "model_server": args.backend,
-      # "start_time": start_dt_proto.ToJsonString(), TODO: marshal this such that the unmarshaller doesn't error
+      "start_time": {
+        "seconds" : start_dt_proto.seconds,
+        "nanos" : start_dt_proto.nanos
+      }
     },
     "summary_stats": {
       "stats": [{