Commit 2a73b0c

Bug fix indirectly overwriting prompt_token_ids on cache recovery
1 parent d4fe2d7 commit 2a73b0c

1 file changed: +9 -6 lines changed

serve/mlc_serve/engine/engine_common.py

Lines changed: 9 additions & 6 deletions
@@ -278,15 +278,18 @@ def get_requests_to_process(
             # TODO(masahi): How to account for token counts in EvalMultiQueryRequest in
             # Prometheus metric?
             elif not state.is_prefilled:
-                token_ids = state.prompt_token_ids
-                # generated_token_ids is added for the case where the request is
-                # recovering from cache eviction.
-
                 if (
                     state.num_sequences == 1
                     and state.generation_sequences[0].generated_token_ids
                 ):
-                    token_ids += state.generation_sequences[0].generated_token_ids
+                    # generated_token_ids is added for the case where the request is
+                    # recovering from cache eviction.
+                    token_ids = (
+                        state.prompt_token_ids
+                        + state.generation_sequences[0].generated_token_ids
+                    )
+                else:
+                    token_ids = state.prompt_token_ids
 
                 requests.append(
                     PrefillRequest(
@@ -457,7 +460,7 @@ def evict_request(self, cancell_callback: Callable[[RequestId], None]) -> int:
                 LOG.warn(
                     f"Cancelling a parallel-sampling request '{request_to_remove.request_id}'"
                     f"since it has generated more than {self.max_num_batched_tokens} tokens in total"
-                    "and currently we do not support preempting such request.",
+                    "and currently we do not support preempting such request.",
                 )
                 continue
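
The first hunk fixes a Python list-aliasing bug: binding token_ids = state.prompt_token_ids makes token_ids refer to the same list object, so the in-place token_ids += ... appended the generated tokens directly into state.prompt_token_ids, corrupting the stored prompt whenever a request was rebuilt after cache eviction. Building the combined list with + leaves the prompt untouched. Below is a minimal, standalone sketch of the behaviour; SeqState is a hypothetical stand-in, not the engine's actual request-state type:

# Minimal sketch of the aliasing bug (SeqState is a hypothetical stand-in,
# not the engine's actual RequestState / GenerationSequence types).
from dataclasses import dataclass, field
from typing import List


@dataclass
class SeqState:
    prompt_token_ids: List[int] = field(default_factory=list)
    generated_token_ids: List[int] = field(default_factory=list)


# Old pattern: token_ids is the *same* list object as prompt_token_ids,
# so the in-place += extends the stored prompt as a side effect.
state = SeqState(prompt_token_ids=[1, 2, 3], generated_token_ids=[7, 8])
token_ids = state.prompt_token_ids
token_ids += state.generated_token_ids
print(state.prompt_token_ids)  # [1, 2, 3, 7, 8]  <- prompt silently mutated

# New pattern: + builds a fresh list; the stored prompt stays intact.
state = SeqState(prompt_token_ids=[1, 2, 3], generated_token_ids=[7, 8])
token_ids = state.prompt_token_ids + state.generated_token_ids
print(state.prompt_token_ids)  # [1, 2, 3]
print(token_ids)               # [1, 2, 3, 7, 8]

The else branch added in the fix keeps the plain binding for the common case, so no extra list is built when there are no recovered tokens to append.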
