From 3672188a234e6688bc150fb3808268d2ddf4b68e Mon Sep 17 00:00:00 2001
From: "huangdi.hd"
Date: Fri, 9 Aug 2024 15:47:53 +0800
Subject: [PATCH] fix issue #63

---
 token_benchmark_ray.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/token_benchmark_ray.py b/token_benchmark_ray.py
index a8c7754..02e56a6 100644
--- a/token_benchmark_ray.py
+++ b/token_benchmark_ray.py
@@ -95,15 +95,17 @@ def get_token_throughput_latencies(
     ):
         iter += 1
 
-        default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
-        default_sampling_params.update(additional_sampling_params)
-        request_config = RequestConfig(
-            model=model,
-            prompt=prompts.pop(),
-            sampling_params=default_sampling_params,
-            llm_api=llm_api,
-        )
-        req_launcher.launch_requests(request_config)
+        if iter <= max_num_completed_requests:
+            default_sampling_params = {"max_tokens": num_output_tokens_list.pop()}
+            default_sampling_params.update(additional_sampling_params)
+            request_config = RequestConfig(
+                model=model,
+                prompt=prompts.pop(),
+                sampling_params=default_sampling_params,
+                llm_api=llm_api,
+            )
+            req_launcher.launch_requests(request_config)
+
         # Retrieving results less frequently allows for more concurrent requests
         # to be launched. This will overall reduce the amount of time it takes
         # for the test to run.
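
Note for reviewers: a minimal, self-contained sketch of the failure mode the new
guard addresses, assuming the loop condition counts *completed* requests while the
body *launches* them (as in get_token_throughput_latencies above). All names, list
sizes, and the fake completion schedule below are illustrative stand-ins, not the
real llmperf API.

# Completions lag behind launches, so a loop gated only on completed
# requests keeps launching past the budget; without the `iter <=
# max_num_completed_requests` check, .pop() eventually raises IndexError
# on the exhausted prompts / num_output_tokens_list.
max_num_completed_requests = 3
num_output_tokens_list = [256] * max_num_completed_requests
prompts = [f"prompt-{i}" for i in range(max_num_completed_requests)]

completed_requests = []
iter = 0
while len(completed_requests) < max_num_completed_requests:
    iter += 1
    if iter <= max_num_completed_requests:
        max_tokens = num_output_tokens_list.pop()
        prompt = prompts.pop()
        print(f"launch {prompt} (max_tokens={max_tokens})")
    # Pretend a result arrives only every other iteration: by iteration 4
    # the lists are empty but the loop is still running, which is exactly
    # when the guard prevents the extra launch.
    if iter % 2 == 0:
        completed_requests.append(iter)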