diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 29ab4cf00..829e8c930 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -24,10 +24,6 @@
 from google.protobuf.timestamp_pb2 import Timestamp
 
-
-# (prompt len, output len, latency)
-REQUEST_LATENCY: List[Tuple[int, int, float]] = []
-
 MIN_SEQ_LEN = 4
 CLIENT_TIMEOUT_SEC = 3 * 60 * 60
 NEW_TEXT_KEY = "\nOutput:\n"
 
@@ -109,6 +105,16 @@ async def get_request(
     # The next request will be sent after the interval.
     await asyncio.sleep(interval)
 
 
+def init_errors_map() -> Dict[str, int]:
+  errors = {
+      "ClientConnectorError": 0,
+      "TimeoutError": 0,
+      "ContentTypeError": 0,
+      "ClientOSError": 0,
+      "ServerDisconnectedError": 0,
+      "unknown_error": 0,
+  }
+  return errors
 
 async def send_request(
@@ -122,9 +128,10 @@ async def send_request(
     tokenizer: PreTrainedTokenizerBase,
     sax_model: str,
     model: str,
-) -> None:
+) -> Tuple[Tuple[int, int, float], Dict[str, int]]:
   """Sends request to server."""
   request_start_time = time.time()
+  errors = init_errors_map()
 
   headers = {"User-Agent": "Benchmark Client"}
   if backend == "vllm":
@@ -195,18 +202,39 @@ async def send_request(
   # Set client timeout to be 3 hrs.
   timeout = aiohttp.ClientTimeout(total=CLIENT_TIMEOUT_SEC)
-  async with aiohttp.ClientSession(timeout=timeout) as session:
+  async with aiohttp.ClientSession(timeout=timeout,trust_env=True) as session:
     while True:
-      async with session.post(api_url, headers=headers, json=pload) as response:
-        chunks = []
-        async for chunk, _ in response.content.iter_chunks():
-          chunks.append(chunk)
-        output = b"".join(chunks).decode("utf-8")
-        output = json.loads(output)
-
-      # Re-send the request if it failed.
-      if "error" not in output:
-        break
+      try:
+        async with session.post(api_url, headers=headers, json=pload, ssl=False) as response:
+          output = await response.json()
+
+          # Re-send the request if it failed.
+          if "error" not in output:
+            break
+      except aiohttp.client_exceptions.ClientConnectorError as client_err:
+        errors["ClientConnectorError"] += 1
+        print(f"ClientConnectorError: {client_err}")
+        return None, errors
+      except asyncio.TimeoutError as timeout_err:
+        errors["TimeoutError"] += 1
+        print(f"TimeoutError: {timeout_err}")
+        return None, errors
+      except aiohttp.client_exceptions.ClientOSError as e:
+        errors["ClientOSError"] += 1
+        print(f"ClientOSError: {e}")
+        return None, errors
+      except aiohttp.client_exceptions.ContentTypeError as e:
+        print(f"ContentTypeError: {e}, response: {response}")
+        errors["ContentTypeError"] += 1
+        return None, errors
+      except aiohttp.client_exceptions.ServerDisconnectedError as e:
+        errors["ServerDisconnectedError"] += 1
+        print(f"ServerDisconnectedError: {e}")
+        return None, errors
+      except Exception as e:
+        print(f"Unknown error {e}")
+        errors["unknown_error"] += 1
+        return None, errors
 
 
   request_end_time = time.time()
   # Naive HF transformers generation and TensorRT-LLM generation stops at EOS
@@ -234,46 +262,61 @@ async def send_request(
     output_token_ids = tokenizer(output["response"]).input_ids
     output_len = len(output_token_ids)
 
-  request_latency = request_end_time - request_start_time
-  REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
-
+  # (prompt len, output len, latency, success)
+  request_latency = (prompt_len, output_len, (request_end_time - request_start_time))
+  return request_latency, None
 
 async def benchmark(
-    backend: str,
+    args: argparse.Namespace,
     api_url: str,
-    input_requests: List[Tuple[str, int, int]],
-    best_of: int,
-    use_beam_search: bool,
-    request_rate: float,
-    top_k: int,
     tokenizer: PreTrainedTokenizerBase,
-    sax_model: str,
     model: str,
-) -> None:
+) -> Tuple[List[Tuple[int, int, float]], Dict[str, int]]:
   """Runs benchmark with asynchronous requests."""
+  input_requests = sample_requests(
+      args.dataset,
+      args.num_prompts,
+      args.max_input_length,
+      args.max_output_length,
+      tokenizer,
+      args.use_dummy_text,
+  )
+  benchmark_start_time = time.time()
   tasks: List[asyncio.Task] = []
-  async for request in get_request(input_requests, request_rate):
+  async for request in get_request(input_requests, args.request_rate):
     prompt, prompt_len, output_len = request
     task = asyncio.create_task(
         send_request(
-            backend,
+            args.backend,
             api_url,
             prompt,
             prompt_len,
             output_len,
-            best_of,
-            use_beam_search,
-            top_k,
+            args.best_of,
+            args.use_beam_search,
+            args.top_k,
             tokenizer,
-            sax_model,
+            args.sax_model,
             model,
         )
     )
     tasks.append(task)
-  await asyncio.gather(*tasks)
+  results = await asyncio.gather(*tasks)
+  combined_latencies = []
+  combined_errors = init_errors_map()
+  for latency, errors in results:
+    if latency:
+      combined_latencies.append(latency)
+    if errors:
+      for err, count in errors.items():
+        combined_errors[err] = combined_errors[err] + count
+
+  benchmark_duration = time.time() - benchmark_start_time
+  print_and_save_result(args, benchmark_duration, len(input_requests), model, combined_latencies, combined_errors)
+  return combined_latencies, combined_errors
 
 
-def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics):
+def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics, model, errors):
   # Setup
   start_dt_proto = Timestamp()
   start_dt_proto.FromDatetime(args.start_datetime)
@@ -282,23 +325,26 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
       # metrics values are numerical
       "metrics" : {
         # Traffic
-        "num_prompts": args.num_prompts,
+        "num_prompts_attempted": benchmark_result['num_prompts_attempted'],
+        "num_prompts_succeeded": benchmark_result['num_prompts_succeeded'],
         "request_rate": args.request_rate,
         'server_metrics': {
           **server_metrics
         },
-        **benchmark_result
+        **benchmark_result,
+        **errors,
       },
       # dimensions values are strings
       "dimensions": {
         "date": args.start_datetime.strftime('%Y%m%d-%H%M%S'),
         "backend": args.backend,
-        "model_id": args.model,
+        "model_id": model,
         "tokenizer_id": args.tokenizer,
         **(json.loads(args.additional_metadata_metrics_to_save) if args.additional_metadata_metrics_to_save else {})
       },
       "config": {
-        "model": args.model,
+        "model": model,
+        "num_models": len(args.models.split(',')),
         "model_server": args.backend,
         "start_time": {
           "seconds" : start_dt_proto.seconds,
@@ -353,9 +399,9 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
   }
 
   # Save to file
-  base_model_id = args.model.split("/")[-1]
+  model_without_slash = model.replace("/","-")
   file_name = (
-      f"{args.backend}-{args.request_rate}qps-{base_model_id}-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}.json"
+      f"{args.file_prefix}-{args.backend}-{args.request_rate}qps-{args.start_datetime.strftime('%Y%m%d-%H%M%S')}-{model_without_slash}.json"
   )
   with open(file_name, "w", encoding="utf-8") as outfile:
     json.dump(final_json, outfile)
@@ -448,13 +494,13 @@ def print_metrics(metrics: List[str], duration: float, backend: str):
   return server_metrics
 
 def get_stats_for_set(name, description, points):
-  avg = np.mean(points)
-  median = np.median(points)
-  sd = np.std(points)
-  min = np.min(points)
-  max = np.max(points)
-  p90 = np.percentile(points, 90)
-  p99 = np.percentile(points, 99)
+  avg = np.mean(points) if points else 0
+  median = np.median(points) if points else 0
+  sd = np.std(points) if points else 0
+  min = np.min(points) if points else 0
+  max = np.max(points) if points else 0
+  p90 = np.percentile(points, 90) if points else 0
+  p99 = np.percentile(points, 99) if points else 0
 
   print(f"Average {description}:"
         f" {avg:.2f}")
@@ -468,58 +514,22 @@ def get_stats_for_set(name, description, points):
       f'p99_{name}': p99,
   }
 
-def main(args: argparse.Namespace):
-  print(args)
-  random.seed(args.seed)
-  np.random.seed(args.seed)
-
-  endpoint = (
-      "v1/completions"
-      if args.backend == "vllm"
-      else args.endpoint
-)
-
-  api_url = f"http://{args.host}:{args.port}/{endpoint}"
-  tokenizer = AutoTokenizer.from_pretrained(
-      args.tokenizer, trust_remote_code=args.trust_remote_code
-  )
-  input_requests = sample_requests(
-      args.dataset,
-      args.num_prompts,
-      args.max_input_length,
-      args.max_output_length,
-      tokenizer,
-      args.use_dummy_text,
-  )
-
-  benchmark_start_time = time.time()
-  args.start_datetime = datetime.fromtimestamp(benchmark_start_time)
-
-  asyncio.run(
-      benchmark(
-          args.backend,
-          api_url,
-          input_requests,
-          args.best_of,
-          args.use_beam_search,
-          args.request_rate,
-          args.top_k,
-          tokenizer,
-          args.sax_model,
-          args.model,
-      )
-  )
+def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_requests, model, request_latencies, errors):
   benchmark_result = {}
-  benchmark_end_time = time.time()
-  benchmark_time = benchmark_end_time - benchmark_start_time
-  print(f"Total time: {benchmark_time:.2f} s")
-  print(f"Requests/min: {60 * args.num_prompts / benchmark_time:.2f}")
-  benchmark_result['benchmark_time'] = benchmark_time
-  benchmark_result['throughput_rps'] = (args.num_prompts / benchmark_time)
+
+  print(f"====Result for Model: {model}====")
+  print(f"Errors: {errors}")
print(f"Total time: {benchmark_duration:.2f} s") + print(f"Successful/total requests: {len(request_latencies)}/{total_requests}") + print(f"Requests/min: {60 * total_requests / benchmark_duration:.2f}") + benchmark_result["num_prompts_attempted"] = total_requests + benchmark_result["num_prompts_succeeded"] = len(request_latencies) + benchmark_result['benchmark_time'] = benchmark_duration + benchmark_result['throughput_rps'] = (args.num_prompts / benchmark_duration) total_output_tokens = np.sum([output_len for _, output_len, _ in - REQUEST_LATENCY]) - output_tokens_per_second = total_output_tokens / benchmark_time + request_latencies]) + output_tokens_per_second = total_output_tokens / benchmark_duration benchmark_result['throughput'] = output_tokens_per_second output_tokens_per_min = 60 * output_tokens_per_second @@ -528,14 +538,14 @@ def main(args: argparse.Namespace): benchmark_result['output_tokens_per_min'] = output_tokens_per_min total_input_tokens = np.sum([prompt_len for prompt_len, _, _ in - REQUEST_LATENCY]) - input_tokens_per_min = 60 * total_input_tokens / benchmark_time + request_latencies]) + input_tokens_per_min = 60 * total_input_tokens / benchmark_duration print(f"Input_tokens/min: {input_tokens_per_min:.2f}") benchmark_result['total_input_tokens'] = int(total_input_tokens) benchmark_result['input_tokens_per_min'] = input_tokens_per_min total_tokens = total_input_tokens + total_output_tokens - tokens_per_min = 60 * total_tokens / benchmark_time + tokens_per_min = 60 * total_tokens / benchmark_duration print(f"Tokens/min: {tokens_per_min:.2f}") benchmark_result['total_tokens'] = int(total_tokens) benchmark_result['tokens_per_min'] = tokens_per_min @@ -550,23 +560,65 @@ def main(args: argparse.Namespace): **benchmark_result, **(get_stats_for_set("per_token_latency", "seconds/token (includes waiting time on server)", [ latency / (prompt_len + output_len) - for prompt_len, output_len, latency in REQUEST_LATENCY + for prompt_len, output_len, latency in request_latencies ])), # NOTE: The latency below includes requests awaiting time on server side. # It's not comparable with the model inference latency for batch size 1. 
-    **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)" ,[1000 * latency for _, _, latency in REQUEST_LATENCY])),
-    **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)", [1000 * latency / output_len for _, output_len, latency in REQUEST_LATENCY])),
-    **(get_stats_for_set("input_len", "input length", [float(prompt_len) for prompt_len, _, _ in REQUEST_LATENCY])),
-    **(get_stats_for_set("output_len", "output length", [float(output_len) for _, output_len, _ in REQUEST_LATENCY]))
+    **(get_stats_for_set("latency", "milliseconds/request (includes waiting time on server)" ,[1000 * latency for _, _, latency in request_latencies])),
+    **(get_stats_for_set("per_output_token_latency", "milliseconds/output_token (includes waiting time on server)", [1000 * latency / output_len for _, output_len, latency in request_latencies])),
+    **(get_stats_for_set("input_len", "input length", [float(prompt_len) for prompt_len, _, _ in request_latencies])),
+    **(get_stats_for_set("output_len", "output length", [float(output_len) for _, output_len, _ in request_latencies]))
   }
 
   server_metrics = {}
   if args.scrape_server_metrics:
-    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_time, args.backend)
+    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend)
   if args.save_json_results:
-    save_json_results(args, benchmark_result, server_metrics)
+    save_json_results(args, benchmark_result, server_metrics, model, errors)
+
+async def main(args: argparse.Namespace):
+  print(args)
+  models = args.models.split(',')
+  print(f"Models to benchmark: {models}")
+  random.seed(args.seed)
+  np.random.seed(args.seed)
+  endpoint = (
+      "v1/completions"
+      if args.backend == "vllm"
+      else args.endpoint
+)
+
+  api_url = f"http://{args.host}:{args.port}/{endpoint}"
+  tokenizer = AutoTokenizer.from_pretrained(
+      args.tokenizer, trust_remote_code=args.trust_remote_code
+  )
+  benchmark_start_time = time.time()
+  args.start_datetime = datetime.fromtimestamp(benchmark_start_time)
+
+  results = await asyncio.gather(
+      *[benchmark(args, api_url, tokenizer, model) for model in models]
+  )
+
+  # Summarize results
+  combined_latencies = []
+  combined_errors = {
+      "ClientConnectorError": 0,
+      "TimeoutError": 0,
+      "ContentTypeError": 0,
+      "ClientOSError": 0,
+      "unknown_error": 0,
+      "ServerDisconnectedError": 0,
+  }
+  for latencies, errors in results:
+    combined_latencies.extend(latencies)
+    for k, v in errors.items():
+      combined_errors[k] = combined_errors[k] + v
+
+  benchmark_duration_all_models = time.time() - benchmark_start_time
+  if args.save_aggregated_result:
+    print_and_save_result(args, benchmark_duration_all_models, len(models)*args.num_prompts, f"ALL-{len(models)}-MODELS", combined_latencies, combined_errors)
 
 if __name__ == "__main__":
   parser = argparse.ArgumentParser(
@@ -591,14 +643,15 @@ def main(args: argparse.Namespace):
       default="",
       help="Model name to send request to at API server for SAX model server.",
   )
+  parser.add_argument("--file-prefix", type=str, default="benchmark")
   parser.add_argument("--endpoint", type=str, default="generate")
   parser.add_argument("--host", type=str, default="localhost")
   parser.add_argument("--port", type=int, default=7080)
   parser.add_argument("--dataset", type=str, help="Path to the dataset.")
   parser.add_argument(
-      "--model",
+      "--models",
       type=str,
-      help="Name of the model.",
+      help="Comma-separated list of models to benchmark.",
   )
   parser.add_argument(
"--tokenizer", @@ -656,7 +709,7 @@ def main(args: argparse.Namespace): "the request arrival times." ), ) - parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--seed", type=int, default=int(time.time())) parser.add_argument( "--trust-remote-code", action="store_true", @@ -681,6 +734,11 @@ def main(args: argparse.Namespace): action="store_true", help="Whether to save benchmark results to a json file.", ) + parser.add_argument( + "--save-aggregated-result", + action="store_true", + help="Whether to aggregate results of all models and save the result.", + ) parser.add_argument( "--additional-metadata-metrics-to-save", type=str, @@ -695,4 +753,4 @@ def main(args: argparse.Namespace): help="Whether to scrape server metrics.", ) cmd_args = parser.parse_args() - main(cmd_args) \ No newline at end of file + asyncio.run(main(cmd_args)) \ No newline at end of file diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh old mode 100644 new mode 100755 index 53e9da2d0..7c8f638e3 --- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh +++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh @@ -26,20 +26,24 @@ fi PYTHON="python3" PYTHON_OPTS="benchmark_serving.py " for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do + echo "Benchmaking request rate: ${request_rate}" # TODO: Check if profile already exists, if so then skip timestamp=$(date +"%Y-%m-%d_%H-%M-%S") output_file="latency-profile-${timestamp}.txt" if [ ${request_rate} == 0 ]; then request_rate="inf" - NUM_PROMPTS=$MAX_NUM_PROMPTS + num_prompts=$MAX_NUM_PROMPTS else - NUM_PROMPTS=$((${request_rate} * $BENCHMARK_TIME_SECONDS)) + num_prompts=$(awk "BEGIN {print int($request_rate * $BENCHMARK_TIME_SECONDS)}") fi - - PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --model=$TOKENIZER --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$NUM_PROMPTS --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH" + echo "TOTAL prompts: $num_prompts" # Output: 8 + PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$num_prompts --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --file-prefix=$FILE_PREFIX --models=$MODELS" if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics" fi + if [[ "$SAVE_AGGREGATED_RESULT" = "true" ]]; then + PYTHON_OPTS="$PYTHON_OPTS --save-aggregated-result" + fi $PYTHON $PYTHON_OPTS > $output_file cat $output_file sleep 5 # wait 5 seconds before next run diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf index 495cfae84..766004b62 100644 --- a/benchmarks/benchmark/tools/profile-generator/main.tf +++ b/benchmarks/benchmark/tools/profile-generator/main.tf @@ -80,4 +80,7 @@ module "latency-profile" { hugging_face_secret = var.hugging_face_secret hugging_face_secret_version = var.hugging_face_secret_version scrape_server_metrics = var.scrape_server_metrics + file_prefix = var.file_prefix + save_aggregated_result = var.save_aggregated_result + models = var.models } \ No newline at end of file diff --git 
index 79b050e3e..5d9d9baea 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -64,5 +64,8 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     k8s_hf_secret_list     = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
     output_bucket          = var.output_bucket
     scrape_server_metrics  = var.scrape_server_metrics
+    file_prefix            = var.file_prefix
+    save_aggregated_result = var.save_aggregated_result
+    models                 = var.models
   }))
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index 520d83379..60eacf4c7 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -20,6 +20,8 @@ spec:
         image: ${artifact_registry}/latency-profile:latest
         command: ["bash", "-c", "./latency_throughput_curve.sh"]
         env:
+        - name: MODELS
+          value: ${models}
        - name: TOKENIZER
          value: ${tokenizer}
        - name: IP
@@ -44,6 +46,10 @@ spec:
          value: ${scrape_server_metrics}
        - name: MAX_NUM_PROMPTS
          value: ${max_num_prompts}
+        - name: FILE_PREFIX
+          value: ${file_prefix}
+        - name: SAVE_AGGREGATED_RESULT
+          value: ${save_aggregated_result}
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
        - name: HF_TOKEN
          valueFrom:
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
index 6e7702e7b..3489b63ad 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
@@ -132,6 +132,13 @@ variable "tokenizer" {
   default     = "tiiuae/falcon-7b"
 }
 
+variable "models" {
+  description = "A comma-separated list of models to benchmark."
+  type        = string
+  nullable    = false
+  default     = "tiiuae/falcon-7b"
+}
+
 variable "output_bucket" {
   description = "Bucket name for storing results"
   type        = string
@@ -176,4 +183,17 @@ variable "benchmark_time_seconds" {
   type        = number
   default     = 120
   nullable    = false
+}
+
+variable "file_prefix" {
+  description = "A prefix for the saved JSON file, useful for adding additional context to the benchmark."
+  type        = string
+  nullable    = false
+  default     = "benchmark"
+}
+
+variable "save_aggregated_result" {
+  description = "Whether to save the aggregated result, useful when benchmarking multiple models."
+  type        = bool
+  default     = false
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index a9ab4b7d4..1e7864bad 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -157,6 +157,13 @@ variable "targets" {
   })
 }
 
+variable "models" {
+  description = "A comma-separated list of models to benchmark."
+  type        = string
+  nullable    = false
+  default     = "tiiuae/falcon-7b"
+}
+
 variable "scrape_server_metrics" {
   description = "Whether to scrape server metrics."
   type        = bool
@@ -168,4 +175,17 @@ variable "benchmark_time_seconds" {
   type        = number
   default     = 120
   nullable    = false
+}
+
+variable "file_prefix" {
+  description = "A prefix for the saved JSON file, useful for adding additional context to the benchmark."
+  type        = string
+  nullable    = false
+  default     = "benchmark"
+}
+
+variable "save_aggregated_result" {
+  description = "Whether to save the aggregated result, useful when benchmarking multiple models."
+  type        = bool
+  default     = false
 }
\ No newline at end of file
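
Note: for reference, latency_throughput_curve.sh assembles the new flags into a single benchmark_serving.py invocation; a roughly equivalent manual run is sketched below. The host, port, dataset path, request rate, prompt counts, lengths, and the second model id are illustrative placeholders, not values taken from this change; only the flag names come from the diff above.

  python3 benchmark_serving.py \
    --backend=vllm --host=localhost --port=7080 \
    --dataset=prompts.json --tokenizer=tiiuae/falcon-7b \
    --models=tiiuae/falcon-7b,example-org/example-model \
    --request-rate=1 --num-prompts=120 \
    --max-input-length=1024 --max-output-length=1024 \
    --save-json-results --save-aggregated-result --file-prefix=benchmark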