
Commit 07628c9

Correctly filter pod monitoring by name and namespace; add more vllm metrics; fix a bug (#10)
Parent: 16f242d

File tree

benchmark_serving.py
latency_throughput_curve.sh

2 files changed (+74, -29 lines)
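
The heart of the fix: the PromQL filters used to assume a PodMonitoring object named "$BACKEND-podmonitoring" and matched on the job label alone, so an identically named job in another namespace could leak into the results. Every query now filters on an explicit job and namespace, supplied by the new --pm-job and --pm-namespace flags. A before/after sketch of the label selector (the job and namespace values here are illustrative):

    metric, duration = "vllm:num_requests_running", 120.0

    # Before: job name derived from the backend, namespace unconstrained.
    old = "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, "vllm", duration)

    # After: explicit job and namespace from --pm-job / --pm-namespace.
    new = "avg_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (
        metric, "vllm-podmonitoring", "benchmark", duration)

    print(old)  # avg_over_time(vllm:num_requests_running{job='vllm-podmonitoring'}[120s])
    print(new)  # avg_over_time(vllm:num_requests_running{job='vllm-podmonitoring',namespace='benchmark'}[120s])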

benchmark_serving.py

Lines changed: 73 additions & 28 deletions
@@ -29,6 +29,7 @@
 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Dict
 from prometheus_client import start_http_server, Histogram, Gauge
+import logging

 import google.auth
 import google.auth.transport.requests
@@ -418,11 +419,11 @@ async def run_single_request(args: argparse.Namespace, api_url: str, tokenizer:
     if args.stream_request:
         result = await send_stream_request(
             args.backend, api_url, prompt, prompt_len, output_len,
-            args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model)
+            args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model, args.request_timeout)
     else:
         result = await send_request(
             args.backend, api_url, prompt, prompt_len, output_len,
-            args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model)
+            args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model, args.request_timeout)
     return chosen_model, result

 async def benchmark(
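
Both request paths now also receive args.request_timeout. The bodies of send_request and send_stream_request sit outside this diff, so the hunk does not show how the timeout is enforced; a plausible minimal sketch, assuming the senders use aiohttp (a hypothetical helper, not the file's actual code):

    import aiohttp

    async def post_with_timeout(api_url: str, payload: dict, request_timeout: float) -> dict:
        # ClientTimeout bounds the whole request, connection setup included.
        timeout = aiohttp.ClientTimeout(total=request_timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(api_url, json=payload) as resp:
                return await resp.json()
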
@@ -608,17 +609,36 @@ def metrics_to_scrape(backend: str) -> List[str]:
     # If a value is specified for a given key, it will be populated on the outputs `summary_stats.stats` field as 'value':'stats' as well.
     if backend == "vllm":
         return [
-            "vllm:gpu_cache_usage_perc",
+            "vllm:cpu_cache_usage_perc",
+            "vllm:gpu_cache_usage_perc",
+
             "vllm:num_requests_waiting",
             "vllm:num_requests_running",
             "vllm:num_requests_swapped",
+
             "vllm:time_to_first_token_seconds",
             "vllm:time_per_output_token_seconds",
+            "vllm:e2e_request_latency_seconds",
+
+            "vllm:request_prefill_time_seconds",
             "vllm:request_queue_time_seconds",
+            "vllm:request_decode_time_seconds",
             "vllm:request_inference_time_seconds",
+            "vllm:time_in_queue_requests",
+
             "vllm:request_prompt_tokens",
             "vllm:request_generation_tokens",
             "vllm:iteration_tokens_total",
+            "vllm:prompt_tokens_total",
+            "vllm:generation_tokens_total",
+            "vllm:request_success_total",
+            "vllm:num_preemptions_total",
+
+            "vllm:cpu_prefix_cache_hit_rate",
+            "vllm:gpu_prefix_cache_hit_rate",
+
+            "vllm:avg_generation_throughput_toks_per_s",
+            "vllm:avg_prompt_throughput_toks_per_s",
         ]
     elif backend == "jetstream":
         return [
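
These names are series exposed on vLLM's Prometheus /metrics endpoint; gauges arrive as single samples, while histograms fan out into _bucket/_sum/_count children, which is why print_metrics below keys its query templates on the metric type. A quick way to check which of these a given server actually exports (a sketch; assumes a vLLM server reachable at localhost:8000, and the exact set varies by vLLM version):

    import requests
    from prometheus_client.parser import text_string_to_metric_families

    body = requests.get("http://localhost:8000/metrics", timeout=10).text
    for family in text_string_to_metric_families(body):
        if family.name.startswith("vllm:"):
            print(family.type, family.name)
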
@@ -628,7 +648,7 @@ def metrics_to_scrape(backend: str) -> List[str]:
     else:
         return []

-def print_metrics(metrics: List[str], duration: float, backend: str):
+def print_metrics(metrics: List[str], duration: float, namespace: str, job: str):
     # Creates a credentials object from the default service account file
     # Assumes that script has appropriate default credentials set up, ref:
     # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
@@ -646,12 +666,12 @@ def print_metrics(metrics: List[str], duration: float, backend: str):
     all_metrics_metadata = request_post.json()
     if request_post.ok is not True:
         print("HTTP Error: %s" % (all_metrics_metadata))
+        return server_metrics
     if all_metrics_metadata["status"] != "success":
         print("Metadata error response: %s" % all_metrics_metadata["error"])
+        return server_metrics

     for metric in metrics:
-        print("Metric Name: %s" % (metric))
-
         # Find metric type
         metric_type = all_metrics_metadata['data'][metric]
         if all_metrics_metadata['data'][metric] is None:
@@ -664,43 +684,58 @@
         # podmonitoring spec assumed to be named "$BACKEND-podmonitoring"
         queries = {
             "gauge": {
-                "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-                "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-                "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-                "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-                "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-                "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-                "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-            },
+                "Mean": "avg_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Median": "quantile_over_time(0.5, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Sd": "stddev_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Min": "min_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Max": "max_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "P90": "quantile_over_time(0.9, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "P95": "quantile_over_time(0.95, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "P99": "quantile_over_time(0.99, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+            },
             "histogram": {
-                "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration),
-                "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-                "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-                "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-                "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-                "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-            }
+                "Mean": "sum(rate(%s_sum{job='%s',namespace='%s'}[%.0fs])) / sum(rate(%s_count{job='%s',namespace='%s'}[%.0fs]))" % (metric, job, namespace, duration, metric, job, namespace, duration),
+                "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+                "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+                "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+                "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+                "P95": "histogram_quantile(0.95, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+                "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+            },
+            "counter": {
+                "Sum": "sum_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Rate": "rate(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Increase": "increase(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+                "Mean": "avg_over_time(rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+                "Max": "max_over_time(rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+                "Min": "min_over_time(rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+                "P90": "quantile_over_time(0.9, rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+                "P95": "quantile_over_time(0.95, rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+                "P99": "quantile_over_time(0.99, rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+            },
         }
         for query_name, query in queries[metric_type].items():
             # Configure respective query
             url = 'https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
             headers_api = {'Authorization': 'Bearer ' + credentials.token}
             params = {'query': query}
-            print(f"Finding {query_name} {metric} with the following query: {query}")
+            logger.debug(f"Finding {query_name} {metric} with the following query: {query}")
             request_post = requests.get(url=url, headers=headers_api, params=params)
             response = request_post.json()

-            print(f"Got response from metrics server: {response}")
+            logger.debug(f"Got response from metrics server: {response}")

             # handle response
             if request_post.ok:
-                if response["status"] == "success":
+                if response["status"] == "success" and response["data"] and response["data"]["result"]:
                     metric_results[query_name] = float(response["data"]["result"][0]["value"][1])
-                    print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
+                    logger.debug("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
                 else:
-                    print("Cloud Monitoring PromQL Error: %s" % (response["error"]))
+                    logger.debug("Cloud Monitoring PromQL Error: %s" % (response))
+                    return server_metrics
             else:
-                print("HTTP Error: %s" % (response))
+                logger.debug("HTTP Error: %s" % (response))
+                return server_metrics
         server_metrics[metric] = metric_results
     return server_metrics
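
Each template above is rendered per metric with plain %-formatting before being sent to the Cloud Monitoring PromQL endpoint. A worked example of what the histogram "P90" template expands to (argument values are illustrative):

    metric, job, namespace, duration = (
        "vllm:time_to_first_token_seconds", "vllm-podmonitoring", "default", 120.0)

    query = ("histogram_quantile(0.9, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))"
             % (metric, job, namespace, duration))
    print(query)
    # histogram_quantile(0.9, sum(rate(vllm:time_to_first_token_seconds_bucket{job='vllm-podmonitoring',namespace='default'}[120s])) by (le))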

@@ -791,7 +826,7 @@ def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_re

     server_metrics = {}
     if args.scrape_server_metrics:
-        server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend)
+        server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.pm_namespace, args.pm_job)
     if args.save_json_results:
         save_json_results(args, benchmark_result, server_metrics, model, errors)

@@ -1018,5 +1053,15 @@ def parse_traffic_split(arg):
         action="store_true",
         help="Whether to scrape server metrics.",
     )
+    parser.add_argument("--pm-namespace", type=str, default="default", help="namespace of the pod monitoring object, ignored if scrape-server-metrics is false")
+    parser.add_argument("--pm-job", type=str, default="vllm-podmonitoring", help="name of the pod monitoring object, ignored if scrape-server-metrics is false")
     cmd_args = parser.parse_args()
+
+    level = logging.INFO
+    logger = logging.getLogger(__name__)
+    logger.setLevel(level)
+    handler = logging.StreamHandler()  # This sends output to the console
+    handler.setLevel(level)  # Set handler level
+    logger.addHandler(handler)
+
     asyncio.run(main(cmd_args))
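
Note the new logger is configured at INFO while all the new call sites use logger.debug, so the per-query output stays quiet by default; restoring the old verbosity means lowering both the logger and handler levels to DEBUG. A minimal equivalent using basicConfig, assuming no other handlers have been installed:

    import logging

    # level=logging.DEBUG would surface the per-query logger.debug lines.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)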

latency_throughput_curve.sh

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   echo "TOTAL prompts: $num_prompts"

   # Build the python command options
-  PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$num_prompts --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --file-prefix=$FILE_PREFIX --models=$MODELS"
+  PYTHON_OPTS="$PYTHON_OPTS --save-json-results --host=$IP --port=$PORT --dataset=$PROMPT_DATASET_FILE --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$num_prompts --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --file-prefix=$FILE_PREFIX --models=$MODELS --pm-namespace=$PM_NAMESPACE --pm-job=$PM_JOB"

   if [[ "$TRAFFIC_SPLIT" ]]; then
     PYTHON_OPTS="$PYTHON_OPTS --traffic-split=$TRAFFIC_SPLIT"
