From 3672188a234e6688bc150fb3808268d2ddf4b68e Mon Sep 17 00:00:00 2001
From: "huangdi.hd"
Date: Fri, 9 Aug 2024 15:47:53 +0800
Subject: [PATCH] fix issue #63

---
 token_benchmark_ray.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py
index a8c7754..02e56a6 100644
--- a/token_benchmark_ray.py
+++ b/token_benchmark_ray.py
@@ -95,15 +95,17 @@ def get_token_throughput_latencies(
     ):
         iter += 1
 
-        default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
-        default_sampling_params.update(additional_sampling_params)
-        request_config = RequestConfig(
-            model=model,
-            prompt=prompts.pop(),
-            sampling_params=default_sampling_params,
-            llm_api=llm_api,
-        )
-        req_launcher.launch_requests(request_config)
+        if iter <= max_num_completed_requests:
+            default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
+            default_sampling_params.update(additional_sampling_params)
+            request_config = RequestConfig(
+                model=model,
+                prompt=prompts.pop(),
+                sampling_params=default_sampling_params,
+                llm_api=llm_api,
+            )
+            req_launcher.launch_requests(request_config)
+
         # Retrieving results less frequently allows for more concurrent requests
         # to be launched. This will overall reduce the amount of time it takes
         # for the test to run.
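
Note for reviewers: a minimal, self-contained sketch of the failure mode the new
guard addresses, assuming the loop condition counts *completed* requests while the
body *launches* them (as in get_token_throughput_latencies above). All names, list
sizes, and the fake completion schedule below are illustrative stand-ins, not the
real llmperf API.

# Completions lag behind launches, so a loop gated only on completed
# requests keeps launching past the budget; without the `iter <=
# max_num_completed_requests` check, .pop() eventually raises IndexError
# on the exhausted prompts / num_output_tokens_list.
max_num_completed_requests = 3
num_output_tokens_list = [256] * max_num_completed_requests
prompts = [f"prompt-{i}" for i in range(max_num_completed_requests)]

completed_requests = []
iter = 0
while len(completed_requests) < max_num_completed_requests:
    iter += 1
    if iter <= max_num_completed_requests:
        max_tokens = num_output_tokens_list.pop()
        prompt = prompts.pop()
        print(f"launch {prompt} (max_tokens={max_tokens})")
    # Pretend a result arrives only every other iteration: by iteration 4
    # the lists are empty but the loop is still running, which is exactly
    # when the guard prevents the extra launch.
    if iter % 2 == 0:
        completed_requests.append(iter)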