Benchmarking script output field deduplication #34

Open
wants to merge 4 commits into base: main

Conversation

@Bslabe123 (Collaborator) commented Apr 10, 2025

We currently record duplicate values for the summary statistics under both the metrics and summary_stats keys; this change removes the duplicates from the metrics key. A sketch of the corresponding filtering is included after the before/after examples below.

Before:

{
  "metrics": {
    "num_prompts_attempted": 100,
    "num_prompts_succeeded": 100,
    "request_rate": 10,
    "server_metrics": {
      "vllm:cpu_cache_usage_perc": {},
      "vllm:gpu_cache_usage_perc": {},
      "vllm:num_requests_waiting": {},
      "vllm:num_requests_running": {},
      "vllm:num_requests_swapped": {},
      "vllm:time_to_first_token_seconds": {},
      "vllm:time_per_output_token_seconds": {},
      "vllm:e2e_request_latency_seconds": {},
      "vllm:request_prefill_time_seconds": {},
      "vllm:request_queue_time_seconds": {},
      "vllm:request_decode_time_seconds": {},
      "vllm:request_inference_time_seconds": {},
      "vllm:time_in_queue_requests": {},
      "vllm:request_prompt_tokens": {},
      "vllm:request_generation_tokens": {},
      "vllm:iteration_tokens_total": {},
      "vllm:prompt_tokens_total": {},
      "vllm:generation_tokens_total": {},
      "vllm:request_success_total": {},
      "vllm:num_preemptions_total": {},
      "vllm:cpu_prefix_cache_hit_rate": {},
      "vllm:gpu_prefix_cache_hit_rate": {},
      "vllm:avg_generation_throughput_toks_per_s": {},
      "vllm:avg_prompt_throughput_toks_per_s": {}
    },
    "benchmark_time": 87.15983057022095,
    "throughput_rps": 1.1473175125028987,
    "throughput": 283.3071133623408,
    "total_output_token": 24693,
    "total_input_tokens": 21631,
    "input_tokens_per_sec": 248.17625112950202,
    "total_tokens": 46324,
    "tokens_per_sec": 531.4833644918427,
    "avg_per_token_latency_ms": 65.62879936484664,
    "median_per_token_latency_ms": 79.78041821641335,
    "sd_per_token_latency_ms": 58.12467562553413,
    "min_per_token_latency_ms": 1.6009688112202982,
    "max_per_token_latency_ms": 482.1956672668457,
    "p90_per_token_latency_ms": 104.27726848783006,
    "p99_per_token_latency_ms": 149.28262528806292,
    "avg_TTFT_ms": 681.1766710996628,
    "median_TTFT_ms": 523.3574693202972,
    "sd_TTFT_ms": 678.7279214216226,
    "min_TTFT_ms": 78.12633419036865,
    "max_TTFT_ms": 3456.4309854507446,
    "p90_TTFT_ms": 1301.4110309600842,
    "p99_TTFT_ms": 3312.3639695787438,
    "avg_ITL_ms": 100.06214696606354,
    "median_ITL_ms": 87.2403130531311,
    "sd_ITL_ms": 278.0551728819678,
    "min_ITL_ms": 0.009039878845214844,
    "max_ITL_ms": 23677.23402738571,
    "p90_ITL_ms": 90.99695301055908,
    "p99_ITL_ms": 352.11324405670166,
    "avg_latency_ms": 25318.23270263672,
    "median_latency_ms": 18174.828491210938,
    "sd_latency_ms": 22968.93149507456,
    "min_latency_ms": 510.90283203125,
    "max_latency_ms": 81436.54663085938,
    "p90_latency_ms": 61525.34230957033,
    "p99_latency_ms": 75749.16485351566,
    "avg_normalized_time_per_output_token_ms": 135.875691689547,
    "median_normalized_time_per_output_token_ms": 110.48952206123582,
    "sd_normalized_time_per_output_token_ms": 96.0256469921684,
    "min_normalized_time_per_output_token_ms": 63.86285400390625,
    "max_normalized_time_per_output_token_ms": 795.3367020456415,
    "p90_normalized_time_per_output_token_ms": 174.08503954541553,
    "p99_normalized_time_per_output_token_ms": 718.4630758117904,
    "avg_input_len": 216.31,
    "median_input_len": 73,
    "sd_input_len": 268.59451576679675,
    "min_input_len": 4,
    "max_input_len": 975,
    "p90_input_len": 601.9000000000007,
    "p99_input_len": 974.01,
    "avg_output_len": 246.93,
    "median_output_len": 146,
    "sd_output_len": 255.60411792457492,
    "min_output_len": 4,
    "max_output_len": 995,
    "p90_output_len": 667.6,
    "p99_output_len": 884.1200000000006,
    "ClientConnectorError": 0,
    "TimeoutError": 0,
    "ContentTypeError": 0,
    "ClientOSError": 0,
    "ServerDisconnectedError": 0,
    "unknown_error": 0
  },
  "dimensions": {
    "date": "20250410-161611",
    "backend": "vllm",
    "model_id": "weighted",
    "tokenizer_id": "meta-llama/Llama-2-7b-hf"
  },
  "config": {
    "model": "weighted",
    "num_models": 1,
    "model_server": "vllm",
    "start_time": {
      "seconds": 1744301771,
      "nanos": 385791000
    }
  },
  "summary_stats": {
    "stats": [
      {
        "request_rate": 10,
        "request_latency": {
          "mean": 25318.23270263672,
          "median": 18174.828491210938,
          "sd": 22968.93149507456,
          "min": 510.90283203125,
          "max": 81436.54663085938,
          "p90": 61525.34230957033,
          "p99": 75749.16485351566
        },
        "throughput": {
          "mean": 283.3071133623408
        },
        "input_length": {
          "mean": 216.31,
          "median": 73,
          "sd": 268.59451576679675,
          "min": 4,
          "max": 975,
          "p90": 601.9000000000007,
          "p99": 974.01
        },
        "output_length": {
          "mean": 246.93,
          "median": 146,
          "sd": 255.60411792457492,
          "min": 4,
          "max": 995,
          "p90": 667.6,
          "p99": 884.1200000000006
        },
        "tpot": {
          "mean": 135.875691689547,
          "median": 110.48952206123582,
          "sd": 96.0256469921684,
          "min": 63.86285400390625,
          "max": 795.3367020456415,
          "p90": 174.08503954541553,
          "p99": 718.4630758117904
        },
     ...

After:

{
  "metrics": {
    "num_prompts_attempted": 100,
    "num_prompts_succeeded": 100,
    "request_rate": 10,
    "server_metrics": {
      "vllm:cpu_cache_usage_perc": {},
      "vllm:gpu_cache_usage_perc": {},
      "vllm:num_requests_waiting": {},
      "vllm:num_requests_running": {},
      "vllm:num_requests_swapped": {},
      "vllm:time_to_first_token_seconds": {},
      "vllm:time_per_output_token_seconds": {},
      "vllm:e2e_request_latency_seconds": {},
      "vllm:request_prefill_time_seconds": {},
      "vllm:request_queue_time_seconds": {},
      "vllm:request_decode_time_seconds": {},
      "vllm:request_inference_time_seconds": {},
      "vllm:time_in_queue_requests": {},
      "vllm:request_prompt_tokens": {},
      "vllm:request_generation_tokens": {},
      "vllm:iteration_tokens_total": {},
      "vllm:prompt_tokens_total": {},
      "vllm:generation_tokens_total": {},
      "vllm:request_success_total": {},
      "vllm:num_preemptions_total": {},
      "vllm:cpu_prefix_cache_hit_rate": {},
      "vllm:gpu_prefix_cache_hit_rate": {},
      "vllm:avg_generation_throughput_toks_per_s": {},
      "vllm:avg_prompt_throughput_toks_per_s": {}
    },
    "benchmark_time": 89.79840230941772,
    "throughput_rps": 1.1136055589879061,
    "throughput": 252.20938699958097,
    "total_output_token": 22648,
    "total_input_tokens": 24724,
    "input_tokens_per_sec": 275.3278384041699,
    "total_tokens": 47372,
    "tokens_per_sec": 527.5372254037509,
    "ClientConnectorError": 0,
    "TimeoutError": 0,
    "ContentTypeError": 0,
    "ClientOSError": 0,
    "ServerDisconnectedError": 0,
    "unknown_error": 0
  },
  "dimensions": {
    "date": "20250410-164545",
    "backend": "vllm",
    "model_id": "weighted",
    "tokenizer_id": "meta-llama/Llama-2-7b-hf"
  },
  "config": {
    "model": "weighted",
    "num_models": 1,
    "model_server": "vllm",
    "start_time": {
      "seconds": 1744303545,
      "nanos": 960754000
    }
  },
  "summary_stats": {
    "stats": [
      {
        "request_rate": 10,
        "request_latency": {
          "mean": 31747127.46826172,
          "median": 31635978.759765625,
          "sd": 21039342.704095166,
          "min": 1089250.9765625,
          "max": 80900479.00390625,
          "p90": 62639594.55566406,
          "p99": 80862957.37548828
        },
        "throughput": {
          "mean": 252.20938699958097
        },
        "input_length": {
          "mean": 247.24,
          "median": 119.5,
          "sd": 283.9288333368064,
          "min": 5,
          "max": 1012,
          "p90": 674.0000000000006,
          "p99": 955.5700000000003
        },
        "output_length": {
          "mean": 226.48,
          "median": 176,
          "sd": 206.0769021506292,
          "min": 4,
          "max": 921,
          "p90": 518.8000000000006,
          "p99": 691.3200000000012
        },
        "ttft": {
          "mean": 9933.80773329258,
          "median": 583.4563143253326,
          "sd": 14529.419121424398,
          "min": 101.06280565261841,
          "max": 37110.926535606384,
          "p90": 36094.4530960083,
          "p99": 37066.00822801113
        },
        "inter_token_latency": {
          "mean": 96.5978099885208,
          "median": 87.64755535125732,
          "sd": 149.29015189934833,
          "min": 0.008959770202636719,
          "max": 15574.367049217224,
          "p90": 90.9701452255249,
          "p99": 323.8559623718262
        },
        "tpot": {
          "mean": 422597.06265315035,
          "median": 121694.35466963978,
          "sd": 1089551.8412275861,
          "min": 87839.82519425217,
          "max": 8983535.034179688,
          "p90": 670745.2299854641,
          "p99": 4052263.4875488533
        },
        "per_token_latencies": {
          "mean": 111.2268114998901,
          "median": 89.54365993742783,
          "sd": 226.35747867726514,
          "min": 1.331602660834352,
          "max": 2226.0436150045957,
          "p90": 175.7089749850049,
          "p99": 533.3310897145078
        },
      ...

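For reference, here is a minimal sketch of the kind of filtering this change implies, assuming the benchmark report is assembled as a plain dict before being serialized to JSON. The function and constant names below are hypothetical illustrations, not the benchmarking script's actual API:

# Hypothetical sketch: keep only counters and throughput fields under "metrics";
# distribution statistics remain solely under "summary_stats".
SUMMARY_STAT_PREFIXES = ("avg_", "median_", "sd_", "min_", "max_", "p90_", "p99_")

def dedupe_metrics(report: dict) -> dict:
    """Return a copy of the report whose "metrics" block drops any field that
    duplicates a summary statistic (avg_/median_/sd_/min_/max_/p90_/p99_)."""
    metrics = report.get("metrics", {})
    deduped = {
        key: value
        for key, value in metrics.items()
        if not key.startswith(SUMMARY_STAT_PREFIXES)
    }
    return {**report, "metrics": deduped}

Applied to the "Before" payload above, a filter like this would leave the prompt counts, throughput figures, and error counters in metrics, while the latency, TTFT, ITL, and token-length distributions appear only under summary_stats, matching the "After" output.
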
@Bslabe123 requested a review from achandrasekar on April 10, 2025 at 20:23