Skip to content

Commit afd137a

Browse files
authored
Fix decoding throughput computation (#45)
* Fix tokens/s
* Remove print
1 parent 5acd7b1 commit afd137a

File tree

1 file changed

+5
-0
lines changed

1 file changed

+5
-0
lines changed

token_benchmark_ray.py

+5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pathlib import Path
66
import re
77
import time
8+
import random
89
from typing import Any, Dict, List, Optional, Tuple
910

1011
import pandas as pd
@@ -56,6 +57,8 @@ def get_token_throughput_latencies(
5657
(e.g. throughput, latencies, etc.)
5758
The individual metrics for each request.
5859
"""
60+
random.seed(11111)
61+
5962
tokenizer = LlamaTokenizerFast.from_pretrained(
6063
"hf-internal-testing/llama-tokenizer"
6164
)
@@ -110,6 +113,7 @@ def get_token_throughput_latencies(
110113
request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
111114
request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
112115
request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
116+
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
113117
all_metrics.append(request_metrics)
114118
completed_requests.extend(all_metrics)
115119
pbar.update(len(completed_requests) - num_completed_requests)
@@ -132,6 +136,7 @@ def get_token_throughput_latencies(
132136
request_metrics[common_metrics.INTER_TOKEN_LAT] = 0
133137
request_metrics[common_metrics.NUM_OUTPUT_TOKENS] = num_output_tokens
134138
request_metrics[common_metrics.NUM_TOTAL_TOKENS] = request_metrics[common_metrics.NUM_INPUT_TOKENS] + num_output_tokens
139+
request_metrics[common_metrics.REQ_OUTPUT_THROUGHPUT] = num_output_tokens / request_metrics[common_metrics.E2E_LAT]
135140

136141
all_metrics.append(request_metrics)
137142
completed_requests.extend(all_metrics)

0 commit comments

Comments (0)