@@ -55,10 +55,36 @@
 active_requests_metric = Gauge('LatencyProfileGenerator:active_requests', 'How many requests actively being processed')
 total_request_count = Counter('LatencyProfileGenerator:request_count', 'How many total requests have been sent')

+# Singleton class to track requests for QPS counting and calculation.
+class AsyncRequestCounter:
+  _instance = None
+  _lock = asyncio.Lock()
+
+  async def __new__(cls, target_requests=None, *args, **kwargs):
+    async with cls._lock:
+      if not cls._instance:
+        cls._instance = super().__new__(cls)
+        cls._instance._count = 0
+        cls._instance._start_time = time.time()
+        cls._instance._target_requests = target_requests
+      return cls._instance
+
+  async def increment(self):
+    async with self._lock:
+      self._count += 1
+      if self._count == self._target_requests:
+        self._end_time = time.time()
+
+  async def get_qps(self):
+    return self._count / (self._end_time - self._start_time)
+
+
 # Add trace config for monitoring in flight requests
 async def on_request_start(session, trace_config_ctx, params):
   active_requests_metric.inc()
   total_request_count.inc()
+  counter = await AsyncRequestCounter()
+  await counter.increment()

 async def on_request_end(session, trace_config_ctx, params):
   active_requests_metric.dec()
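
Note: the singleton above works because `async def __new__` returns a coroutine, so `await AsyncRequestCounter()` always resolves to the one shared instance. Below is a minimal sketch of the counter in isolation (not part of this diff; it assumes a fresh process with the class above and `import asyncio` / `import time` already in scope):

import asyncio

async def _demo():
  # First call creates the singleton and records the start time.
  counter = await AsyncRequestCounter(target_requests=3)
  for _ in range(3):
    await asyncio.sleep(0.1)            # stand-in for real request latency
    same = await AsyncRequestCounter()  # later calls return the same instance
    await same.increment()              # the third increment stamps _end_time
  qps = await counter.get_qps()         # ~3 requests over ~0.3 s elapsed
  print(f"QPS: {qps:.2f}")

asyncio.run(_demo())

One caveat worth flagging: `_end_time` is only set when `_count` reaches `_target_requests`, so `get_qps()` raises `AttributeError` if fewer than `target_requests` increments ever occur.
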
@@ -462,6 +488,8 @@ async def benchmark(
   model_weights = list(models_dict.values())

   benchmark_start_time_sec = time.time()
+  # Initialize the counter with target prompts
+  await AsyncRequestCounter(args.num_prompts)
   tasks: List[asyncio.Task] = []
   prompts_sent = 0
   async for request in generate_next_request(input_requests, args.request_rate):
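
Because the counter is a singleton, this first construction fixes the target: the argument-less `await AsyncRequestCounter()` calls in `on_request_start` get the same instance back and do not overwrite `_target_requests`. A hypothetical standalone check of that behaviour (again assuming a fresh process):

async def _check_first_construction_wins():
  first = await AsyncRequestCounter(target_requests=10)   # first call sets the target
  second = await AsyncRequestCounter(target_requests=99)  # ignored: singleton already exists
  assert first is second
  assert second._target_requests == 10

asyncio.run(_check_first_construction_wins())
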
@@ -503,12 +531,12 @@ async def benchmark(

   benchmark_duration_sec = time.time() - benchmark_start_time_sec

-  print_and_save_result(args, benchmark_duration_sec, prompts_sent, "weighted",
+  await print_and_save_result(args, benchmark_duration_sec, prompts_sent, "weighted",
                        overall_results["latencies"], overall_results["ttfts"],
                        overall_results["itls"], overall_results["tpots"],
                        overall_results["errors"])
   for model, data in per_model_results.items():
-    print_and_save_result(args, benchmark_duration_sec, len(data["latencies"]), model,
+    await print_and_save_result(args, benchmark_duration_sec, len(data["latencies"]), model,
                        data["latencies"], data["ttfts"], data["itls"],
                        data["tpots"], data["errors"])

@@ -524,6 +552,7 @@ def save_json_results(args: argparse.Namespace, benchmark_result, server_metrics
     "num_prompts_attempted": benchmark_result['num_prompts_attempted'],
     "num_prompts_succeeded": benchmark_result['num_prompts_succeeded'],
     "request_rate": args.request_rate,
+    "queries_per_second": benchmark_result['queries_per_second'],
     'server_metrics': {
       **server_metrics
     },
@@ -762,15 +791,6 @@ def print_metrics(metrics: List[str], duration_sec: float, namespace: str, job:
       logger.debug("HTTP Error: %s" % (response))
       continue
     server_metrics[metric] = metric_results
-
-
-  url = 'https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
-  headers_api = {'Authorization': 'Bearer ' + credentials.token}
-  params = {'query': f'rate(LatencyProfileGenerator:request_count_total[{duration}s])'}
-  logger.debug(f"Finding {query_name} {metric} with the following query: {query}")
-  request_post = requests.get(url=url, headers=headers_api, params=params)
-  response = request_post.json()
-  print(f"Got response for benchmarking prom metrics: {response}")

   return server_metrics

@@ -795,14 +815,18 @@ def get_stats_for_set(name, description, points):
     f'p99_{name}': p99,
   }

-def print_and_save_result(args: argparse.Namespace, benchmark_duration_sec, total_requests, model, request_latencies, ttfts, itls, tpots, errors):
+async def print_and_save_result(args: argparse.Namespace, benchmark_duration_sec, total_requests, model, request_latencies, ttfts, itls, tpots, errors):
   benchmark_result = {}

   print(f"====Result for Model: {model}====")
   print(f"Errors: {errors}")
   print(f"Total time (seconds): {benchmark_duration_sec:.2f}s")
   print(f"Successful/total requests: {len(request_latencies)}/{total_requests}")
   print(f"Requests/sec: {total_requests / benchmark_duration_sec:.2f}")
+  counter = await AsyncRequestCounter()
+  queries_per_second = await counter.get_qps()
+  print(f"Queries/sec: {queries_per_second:.2f}")
+  benchmark_result['queries_per_second'] = queries_per_second
   benchmark_result["num_prompts_attempted"] = total_requests
   benchmark_result["num_prompts_succeeded"] = len(request_latencies)
   benchmark_result['benchmark_time'] = benchmark_duration_sec