from typing import Optional

import datasets
+import logging
import matplotlib.pyplot as plt
import numpy as np
import requests
from tqdm.contrib.concurrent import thread_map


+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("server-bench")
+
+
def get_prompts(n_prompts: int) -> list[str]:
-    print("Loading MMLU dataset...")
-    ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]
+    logger.info("Loading MMLU dataset...")
+    ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]  # type: ignore
    if n_prompts >= 0:
        ret = ret[:n_prompts]
    return ret


def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
-    print("Starting the llama.cpp server...")
+    logger.info("Starting the llama.cpp server...")
    address = f"http://localhost:{port}"

    popen_args: list[str] = [
@@ -78,7 +83,7 @@ def get_prompt_length(data: dict) -> int:
    return len(tokens)


-def send_prompt(data: dict) -> tuple[int, float, list[float]]:
+def send_prompt(data: dict) -> tuple[float, list[float]]:
    session = data["session"]
    server_address: str = data["server_address"]

@@ -93,6 +98,7 @@ def send_prompt(data: dict) -> tuple[int, float, list[float]]:
    json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
    response = session.post(f"{server_address}/completion", json=json_data, stream=True)

+    last_valid_line: str = ""
    token_arrival_times: list[float] = []
    for line in response.iter_lines(decode_unicode=True):
        if not line.startswith("data: "):
@@ -111,21 +117,20 @@ def send_prompt(data: dict) -> tuple[int, float, list[float]]:
def benchmark(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int, n_prompts: int, n_predict: int):
    prompts: list[str] = get_prompts(n_prompts)

-    server = None
+    server: Optional[dict] = None
    try:
-        server: dict = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
+        server = get_server(path_server, path_model, path_log, port, n_gpu_layers, parallel, ctx_size)
        server_address: str = server["address"]

        with requests.Session() as session:
            data: list[dict] = []
            for i, p in enumerate(prompts):
                data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})

-            print("Getting the prompt lengths...")
-            prompt_n: list[int] = [get_prompt_length(d) for d in data]
+            logger.info("Getting the prompt lengths...")
+            prompt_n = [get_prompt_length(d) for d in data]

-            print("Starting the benchmark...")
-            print()
+            logger.info("Starting the benchmark...\n")
            t0 = time()
            results: list[tuple[int, list[float]]] = thread_map(send_prompt, data, max_workers=parallel + 1, chunksize=1)
    finally:
@@ -149,17 +154,17 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
    token_t -= t0
    token_t_last = np.max(token_t)

-    print()
-    print(f"Benchmark duration: {token_t_last:.2f} s")
-    print(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last / 60):.2f} requests/min")
-    print(f"Total prompt length: {np.sum(prompt_n)} tokens")
-    print(f"Average prompt length: {np.mean(prompt_n):.2f} tokens")
-    print(f"Average prompt latency: {np.mean(prompt_ms):.2f} ms")
-    print(f"Average prompt speed: {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
-    print(f"Total generated tokens: {token_t.shape[0]}")
-    print(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens")
-    print(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s")
-    print(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")
+    logger.info("")
+    logger.info(f"Benchmark duration: {token_t_last:.2f} s")
+    logger.info(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last / 60):.2f} requests/min")
+    logger.info(f"Total prompt length: {np.sum(prompt_n)} tokens")
+    logger.info(f"Average prompt length: {np.mean(prompt_n):.2f} tokens")
+    logger.info(f"Average prompt latency: {np.mean(prompt_ms):.2f} ms")
+    logger.info(f"Average prompt speed: {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
+    logger.info(f"Total generated tokens: {token_t.shape[0]}")
+    logger.info(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens")
+    logger.info(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s")
+    logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")

    plt.figure()
    plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)