 import time
 from typing import AsyncGenerator, List, Optional, Tuple, Dict
 from prometheus_client import start_http_server, Histogram, Gauge
+import logging
 
 import google.auth
 import google.auth.transport.requests
@@ -418,11 +419,11 @@ async def run_single_request(args: argparse.Namespace, api_url: str, tokenizer:
   if args.stream_request:
     result = await send_stream_request(
       args.backend, api_url, prompt, prompt_len, output_len,
-      args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model)
+      args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model, args.request_timeout,)
   else:
     result = await send_request(
       args.backend, api_url, prompt, prompt_len, output_len,
-      args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model)
+      args.best_of, args.use_beam_search, args.top_k, tokenizer, args.sax_model, chosen_model, args.request_timeout,)
   return chosen_model, result
 
 async def benchmark(
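Both helpers now receive `args.request_timeout` as their final argument. The helpers' bodies are outside this diff, so the following is only a minimal sketch of how such a per-request timeout could be honored, assuming the requests are issued through aiohttp; the payload and signature shown here are illustrative, not the PR's actual implementation.

    import aiohttp

    async def send_request(backend, api_url, prompt, prompt_len, output_len,
                           best_of, use_beam_search, top_k, tokenizer,
                           sax_model, model, request_timeout):
      # Sketch: bound the whole call (connect + response) by --request-timeout.
      timeout = aiohttp.ClientTimeout(total=request_timeout)
      async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.post(api_url, json={"prompt": prompt}) as resp:
          return await resp.json()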
@@ -608,17 +609,36 @@ def metrics_to_scrape(backend: str) -> List[str]:
   # If a value is specified for a given key, it will be populated on the output's `summary_stats.stats` field as 'value':'stats' as well.
   if backend == "vllm":
     return [
-      "vllm:gpu_cache_usage_perc",
+      "vllm:cpu_cache_usage_perc",
+      "vllm:gpu_cache_usage_perc",
+
       "vllm:num_requests_waiting",
       "vllm:num_requests_running",
       "vllm:num_requests_swapped",
+
       "vllm:time_to_first_token_seconds",
       "vllm:time_per_output_token_seconds",
+      "vllm:e2e_request_latency_seconds",
+
+      "vllm:request_prefill_time_seconds",
       "vllm:request_queue_time_seconds",
+      "vllm:request_decode_time_seconds",
       "vllm:request_inference_time_seconds",
+      "vllm:time_in_queue_requests",
+
       "vllm:request_prompt_tokens",
       "vllm:request_generation_tokens",
       "vllm:iteration_tokens_total",
+      "vllm:prompt_tokens_total",
+      "vllm:generation_tokens_total",
+      "vllm:request_success_total",
+      "vllm:num_preemptions_total",
+
+      "vllm:cpu_prefix_cache_hit_rate",
+      "vllm:gpu_prefix_cache_hit_rate",
+
+      "vllm:avg_generation_throughput_toks_per_s",
+      "vllm:avg_prompt_throughput_toks_per_s",
     ]
   elif backend == "jetstream":
     return [
@@ -628,7 +648,7 @@ def metrics_to_scrape(backend: str) -> List[str]:
   else:
     return []
 
-def print_metrics(metrics: List[str], duration: float, backend: str):
+def print_metrics(metrics: List[str], duration: float, namespace: str, job: str):
   # Creates a credentials object from the default service account file
   # Assumes that script has appropriate default credentials set up, ref:
   # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
@@ -646,12 +666,12 @@ def print_metrics(metrics: List[str], duration: float, backend: str):
   all_metrics_metadata = request_post.json()
   if request_post.ok is not True:
     print("HTTP Error: %s" % (all_metrics_metadata))
+    return server_metrics
   if all_metrics_metadata["status"] != "success":
     print("Metadata error response: %s" % all_metrics_metadata["error"])
+    return server_metrics
 
   for metric in metrics:
-    print("Metric Name: %s" % (metric))
-
     # Find metric type
     metric_type = all_metrics_metadata['data'][metric]
     if all_metrics_metadata['data'][metric] is None:
@@ -664,43 +684,58 @@ def print_metrics(metrics: List[str], duration: float, backend: str):
-    # podmonitoring spec assumed to be named "$BACKEND-podmonitoring"
+    # Queries select series by the PodMonitoring job name and namespace passed in by the caller.
     queries = {
       "gauge": {
-        "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-        "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-        "Sd": "stddev_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-        "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-        "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-        "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-        "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
-      },
+        "Mean": "avg_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Median": "quantile_over_time(0.5, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Sd": "stddev_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Min": "min_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Max": "max_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "P90": "quantile_over_time(0.9, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "P95": "quantile_over_time(0.95, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "P99": "quantile_over_time(0.99, %s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+      },
       "histogram": {
-        "Mean": "sum(rate(%s_sum{job='%s-podmonitoring'}[%.0fs])) / sum(rate(%s_count{job='%s-podmonitoring'}[%.0fs]))" % (metric, backend, duration, metric, backend, duration),
-        "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-        "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-        "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-        "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-        "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s-podmonitoring'}[%.0fs])) by (le))" % (metric, backend, duration),
-      }
+        "Mean": "sum(rate(%s_sum{job='%s',namespace='%s'}[%.0fs])) / sum(rate(%s_count{job='%s',namespace='%s'}[%.0fs]))" % (metric, job, namespace, duration, metric, job, namespace, duration),
+        "Median": "histogram_quantile(0.5, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+        "Min": "histogram_quantile(0, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+        "Max": "histogram_quantile(1, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+        "P90": "histogram_quantile(0.9, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+        "P95": "histogram_quantile(0.95, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+        "P99": "histogram_quantile(0.99, sum(rate(%s_bucket{job='%s',namespace='%s'}[%.0fs])) by (le))" % (metric, job, namespace, duration),
+      },
+      "counter": {
+        "Sum": "sum_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Rate": "rate(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Increase": "increase(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration),
+        "Mean": "avg_over_time(rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+        "Max": "max_over_time(rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+        "Min": "min_over_time(rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+        "P90": "quantile_over_time(0.9, rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+        "P95": "quantile_over_time(0.95, rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+        "P99": "quantile_over_time(0.99, rate(%s{job='%s',namespace='%s'}[%.0fs])[%.0fs:%.0fs])" % (metric, job, namespace, duration, duration, duration),
+      },
     }
     for query_name, query in queries[metric_type].items():
       # Configure respective query
       url = 'https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
       headers_api = {'Authorization': 'Bearer ' + credentials.token}
       params = {'query': query}
-      print(f"Finding {query_name} {metric} with the following query: {query}")
+      logger.debug(f"Finding {query_name} {metric} with the following query: {query}")
       request_post = requests.get(url=url, headers=headers_api, params=params)
       response = request_post.json()
 
-      print(f"Got response from metrics server: {response}")
+      logger.debug(f"Got response from metrics server: {response}")
 
       # handle response
       if request_post.ok:
-        if response["status"] == "success":
+        if response["status"] == "success" and response["data"] and response["data"]["result"]:
           metric_results[query_name] = float(response["data"]["result"][0]["value"][1])
-          print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
+          logger.debug("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
         else:
-          print("Cloud Monitoring PromQL Error: %s" % (response["error"]))
+          logger.debug("Cloud Monitoring PromQL Error: %s" % (response))
+          return server_metrics
       else:
-        print("HTTP Error: %s" % (response))
+        logger.debug("HTTP Error: %s" % (response))
+        return server_metrics
     server_metrics[metric] = metric_results
   return server_metrics
 
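For reference, the loop above renders each query by substituting the metric name plus the job and namespace selectors into a PromQL template. The values below are placeholders chosen for illustration, not output from a real run:

    # Illustrative only: one gauge query as assembled by the loop above.
    metric, job, namespace, duration = "vllm:num_requests_running", "vllm-podmonitoring", "default", 120.0
    query = "avg_over_time(%s{job='%s',namespace='%s'}[%.0fs])" % (metric, job, namespace, duration)
    print(query)
    # -> avg_over_time(vllm:num_requests_running{job='vllm-podmonitoring',namespace='default'}[120s])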
@@ -791,7 +826,7 @@ def print_and_save_result(args: argparse.Namespace, benchmark_duration, total_re
 
   server_metrics = {}
   if args.scrape_server_metrics:
-    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.backend)
+    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_duration, args.pm_namespace, args.pm_job)
   if args.save_json_results:
     save_json_results(args, benchmark_result, server_metrics, model, errors)
 
@@ -1018,5 +1053,15 @@ def parse_traffic_split(arg):
     action="store_true",
     help="Whether to scrape server metrics.",
   )
+  parser.add_argument("--pm-namespace", type=str, default="default", help="Namespace of the PodMonitoring object; ignored if --scrape-server-metrics is not set.")
+  parser.add_argument("--pm-job", type=str, default="vllm-podmonitoring", help="Name of the PodMonitoring object; ignored if --scrape-server-metrics is not set.")
   cmd_args = parser.parse_args()
+
+  level = logging.INFO
+  logger = logging.getLogger(__name__)
+  logger.setLevel(level)
+  handler = logging.StreamHandler()  # Send log output to the console.
+  handler.setLevel(level)
+  logger.addHandler(handler)
+
   asyncio.run(main(cmd_args))
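Note that the logger and its console handler are configured at INFO level, while the new tracing in print_metrics is emitted with logger.debug, so those messages stay hidden by default. A minimal sketch of making the level switchable is shown below; the BENCHMARK_LOG_LEVEL environment variable is a hypothetical illustration, not part of this change:

    import logging
    import os

    # Hypothetical: read the level from the environment so the logger.debug()
    # query/response traces can be enabled without editing the script.
    level = getattr(logging, os.environ.get("BENCHMARK_LOG_LEVEL", "INFO").upper(), logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(level)
    handler = logging.StreamHandler()  # console output
    handler.setLevel(level)
    logger.addHandler(handler)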