 from pathlib import Path
 import re
 import time
+import random
 from typing import Any, Dict, List, Optional, Tuple
 
 import pandas as pd
@@ -56,6 +57,8 @@ def get_token_throughput_latencies(
         (e.g. throughput, latencies, etc.)
         The individual metrics for each request.
     """
+    random.seed(11111)
+
     tokenizer = LlamaTokenizerFast.from_pretrained(
         "hf-internal-testing/llama-tokenizer"
     )
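The seed added above pins Python's global RNG, presumably so that the randomly sampled per-request token counts (and hence the whole benchmark workload) are identical from run to run. A minimal sketch of the effect, assuming the benchmark draws token counts via the `random` module (the range below is invented for illustration):

import random

random.seed(11111)
first = [random.randint(128, 512) for _ in range(3)]

random.seed(11111)
second = [random.randint(128, 512) for _ in range(3)]

# Re-seeding with the same value replays the same draws,
# so two benchmark runs sample the same workload.
assert first == second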
@@ -110,6 +113,7 @@ def get_token_throughput_latencies(
                 request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
                 request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
                 request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
+                request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
                 all_metrics.append(request_metrics)
             completed_requests.extend(all_metrics)
         pbar.update(len(completed_requests) - num_completed_requests)
@@ -132,6 +136,7 @@ def get_token_throughput_latencies(
             request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
             request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
             request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
+            request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
 
         all_metrics.append(request_metrics)
     completed_requests.extend(all_metrics)
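Both hunks above record the same new per-request metric: output throughput, i.e. the number of generated tokens divided by that request's end-to-end latency. A standalone sketch of the computation, assuming common_metrics maps E2E_LAT and REQ_OUTPUT_THROUGHPUT to the string keys shown below (the key names and sample values are illustrative, not taken from the diff):

# Hypothetical per-request metrics dict, as the results loop would see it.
request_metrics = {
    "end_to_end_latency_s": 2.0,    # stands in for common_metrics.E2E_LAT
    "number_output_tokens": 300,    # stands in for common_metrics.NUM_OUTPUT_TOKENS
}
num_output_tokens = request_metrics["number_output_tokens"]

# Tokens generated per second of end-to-end latency. Because E2E latency
# includes time to first token, this understates pure decode throughput.
request_metrics["request_output_throughput_token_per_s"] = (
    num_output_tokens / request_metrics["end_to_end_latency_s"]
)

print(request_metrics["request_output_throughput_token_per_s"])  # 150.0

Note that the division assumes a nonzero end-to-end latency, which holds for any completed request.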