Rely on script to gather data, not Prometheus #836

Bslabe123 · 2024-09-25T22:19:18Z

The metrics gathered by the benchmarking script are the closest to what an end user would see, so we should use them for generating profiles. start_time has been temporarily removed because more investigation is needed to implement it correctly; it will be re-added in a follow-up PR.

Example output:

{
  "metrics": {
    "num_prompts": 600,
    "request_rate": 20,
    "server_metrics": {
      "jetstream_slots_used_percentage": {
        "Mean": "0.80625",
        "Median": "1",
        "Sd": "0.3875",
        "Min": "0.03125",
        "Max": "1",
        "P90": "1",
        "P99": "1"
      },
      "jetstream_prefill_backlog_size": {
        "Mean": "62.6",
        "Median": "50",
        "Sd": "49.49989898979593",
        "Min": "0",
        "Max": "142",
        "P90": "121.60000000000001",
        "P99": "139.96"
      },
      "jetstream_time_to_first_token": {
        "Mean": "0.1309072266643019",
        "Median": "0.16533264033264033",
        "Min": "0.01",
        "Max": "0.5",
        "P90": "0.23331600831600832",
        "P99": "0.2486122661122661"
      },
      "jetstream_time_per_output_token": {
        "Mean": "0",
        "Median": "0.029783950617283953",
        "Min": "0.01",
        "Max": "2.5",
        "P90": "0.06992857142857141",
        "P99": "2.5"
      },
      "jetstream_time_per_request": {
        "Mean": "2.4860859514783478",
        "Median": "2.177941176470588",
        "Min": "1",
        "Max": "10",
        "P90": "5.813432835820894",
        "P99": "9.581343283582088"
      }
    },
    "benchmark_time": 45.24558734893799,
    "throughput": 13.260961679483719,
    "total_output_token": 58867,
    "output_tokens_per_min": 78063.30311861681,
    "total_input_tokens": 39871,
    "input_tokens_per_min": 52872.78031226953,
    "total_tokens": 98738,
    "tokens_per_min": 130936.08343088634,
    "avg_per_token_latency": 0.06252767149430062,
    "median_per_token_latency": 0.04538016859454204,
    "sd_per_token_latency": 0.05840462894333337,
    "min_per_token_latency": 0.0006931327638171968,
    "max_per_token_latency": 0.44351615403827865,
    "p90_per_token_latency": 0.13468392537190366,
    "p99_per_token_latency": 0.2823492748809582,
    "avg_latency": 7392.447297970454,
    "median_latency": 7913.007378578186,
    "sd_latency": 3714.844899591218,
    "min_latency": 145.55788040161133,
    "max_latency": 15069.028615951538,
    "p90_latency": 12280.448532104492,
    "p99_latency": 14529.40128326416,
    "avg_per_output_token_latency": 445.3496100653319,
    "median_per_output_token_latency": 76.25416885722768,
    "sd_per_output_token_latency": 1483.427880651156,
    "min_per_output_token_latency": 21.25244935353597,
    "max_per_output_token_latency": 10672.072649002075,
    "p90_per_output_token_latency": 460.75902462005627,
    "p99_per_output_token_latency": 8710.38443803787,
    "avg_input_len": 66.45166666666667,
    "median_input_len": 26,
    "sd_input_len": 75.05112255804188,
    "min_input_len": 4,
    "max_input_len": 256,
    "p90_input_len": 198,
    "p99_input_len": 251.01,
    "avg_output_len": 98.11166666666666,
    "median_output_len": 75.5,
    "sd_output_len": 79.47290437976679,
    "min_output_len": 1,
    "max_output_len": 259,
    "p90_output_len": 224.10000000000002,
    "p99_output_len": 255.01
  },
  "dimensions": {
    "date": "20240925-220923",
    "backend": "jetstream",
    "model_id": "google/gemma-7b",
    "tokenizer_id": "google/gemma-7b"
  },
  "config": {
    "model": "google/gemma-7b",
    "model_server": "jetstream"
  },
  "summary_stats": {
    "stats": [
      {
        "request_rate": 0,
        "request_latency": {
          "mean": 7392.447297970454,
          "median": 7913.007378578186,
          "sd": 3714.844899591218,
          "min": 145.55788040161133,
          "max": 15069.028615951538,
          "p90": 12280.448532104492,
          "p99": 14529.40128326416
        },
        "throughput": {
          "mean": 13.260961679483719
        },
        "input_length": {
          "mean": 66.45166666666667,
          "median": 26,
          "sd": 75.05112255804188,
          "min": 4,
          "max": 256,
          "p90": 198,
          "p99": 251.01
        },
        "output_length": {
          "mean": 98.11166666666666,
          "median": 75.5,
          "sd": 79.47290437976679,
          "min": 1,
          "max": 259,
          "p90": 224.10000000000002,
          "p99": 255.01
        },
        "tpot": {
          "mean": 445.3496100653319,
          "median": 76.25416885722768,
          "sd": 1483.427880651156,
          "min": 21.25244935353597,
          "max": 10672.072649002075,
          "p90": 460.75902462005627,
          "p99": 8710.38443803787
        },
        "model_server_metrics": [
          {
            "Name": "jetstream_slots_used_percentage",
            "Mean": "0.80625",
            "Median": "1",
            "Sd": "0.3875",
            "Min": "0.03125",
            "Max": "1",
            "P90": "1",
            "P99": "1"
          },
          {
            "Name": "jetstream_prefill_backlog_size",
            "Mean": "62.6",
            "Median": "50",
            "Sd": "49.49989898979593",
            "Min": "0",
            "Max": "142",
            "P90": "121.60000000000001",
            "P99": "139.96"
          },
          {
            "Name": "jetstream_time_to_first_token",
            "Mean": "0.1309072266643019",
            "Median": "0.16533264033264033",
            "Min": "0.01",
            "Max": "0.5",
            "P90": "0.23331600831600832",
            "P99": "0.2486122661122661"
          },
          {
            "Name": "jetstream_time_per_output_token",
            "Mean": "0",
            "Median": "0.029783950617283953",
            "Min": "0.01",
            "Max": "2.5",
            "P90": "0.06992857142857141",
            "P99": "2.5"
          },
          {
            "Name": "jetstream_time_per_request",
            "Mean": "2.4860859514783478",
            "Median": "2.177941176470588",
            "Min": "1",
            "Max": "10",
            "P90": "5.813432835820894",
            "P99": "9.581343283582088"
          }
        ]
      }
    ]
  }
}

benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py

* first commit * remove start time * parse float * Only print average to .txt output * fix start_time

Bslabe123 added 2 commits September 25, 2024 22:13

first commit

89af4bd

remove start time

fd46e2f

Bslabe123 requested review from achandrasekar, ahg-g and annapendleton as code owners September 25, 2024 22:19

parse float

1e03e44

achandrasekar reviewed Sep 26, 2024

View reviewed changes

benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py Outdated Show resolved Hide resolved

benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py Show resolved Hide resolved

Bslabe123 added 2 commits September 26, 2024 17:00

Only print average to .txt output

b8a511b

fix start_time

f9f76c5

achandrasekar approved these changes Sep 26, 2024

View reviewed changes

Bslabe123 merged commit 45a46b8 into main Sep 26, 2024
11 checks passed

achandrasekar pushed a commit to achandrasekar/ai-on-gke that referenced this pull request Sep 29, 2024

Rely on script to gather data, not Prometheus (GoogleCloudPlatform#836)

6dcbe07

* first commit * remove start time * parse float * Only print average to .txt output * fix start_time

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Rely on script to gather data, not Prometheus #836

Rely on script to gather data, not Prometheus #836

Bslabe123 commented Sep 25, 2024 •

edited

Loading

Rely on script to gather data, not Prometheus #836

Rely on script to gather data, not Prometheus #836

Conversation

Bslabe123 commented Sep 25, 2024 • edited Loading

Bslabe123 commented Sep 25, 2024 •

edited

Loading