Bug fix indirectly overwriting prompt_token_ids on cache recovery
masahi committed Feb 2, 2024
1 parent d4fe2d7 commit 2a73b0c
Showing 1 changed file with 9 additions and 6 deletions.
serve/mlc_serve/engine/engine_common.py: 9 additions, 6 deletions
@@ -278,15 +278,18 @@ def get_requests_to_process(
         # TODO(masahi): How to account for token counts in EvalMultiQueryRequest in
         # Prometheus metric?
         elif not state.is_prefilled:
-            token_ids = state.prompt_token_ids
-            # generated_token_ids is added for the case where the request is
-            # recovering from cache eviction.
-
             if (
                 state.num_sequences == 1
                 and state.generation_sequences[0].generated_token_ids
             ):
-                token_ids += state.generation_sequences[0].generated_token_ids
+                # generated_token_ids is added for the case where the request is
+                # recovering from cache eviction.
+                token_ids = (
+                    state.prompt_token_ids
+                    + state.generation_sequences[0].generated_token_ids
+                )
+            else:
+                token_ids = state.prompt_token_ids
 
             requests.append(
                 PrefillRequest(
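The root cause is Python list aliasing: in the old code, `token_ids` was bound to the same list object as `state.prompt_token_ids`, so the in-place `+=` (equivalent to `list.extend`) appended the generated tokens to the stored prompt as well. Building a fresh list with `+` leaves the prompt untouched. The minimal sketch below reproduces the behavior with plain lists; the variable names are illustrative stand-ins, not the engine's request-state objects.

```python
# Minimal, self-contained sketch of the aliasing bug fixed above.
# prompt_token_ids / generated_token_ids stand in for state.prompt_token_ids and
# generation_sequences[0].generated_token_ids; they are not the engine's objects.

prompt_token_ids = [1, 2, 3]
generated_token_ids = [4, 5]

# Buggy pattern: token_ids is just another name for the same list object,
# so the in-place += (list.extend) also grows prompt_token_ids.
token_ids = prompt_token_ids
token_ids += generated_token_ids
assert prompt_token_ids == [1, 2, 3, 4, 5]  # prompt has been corrupted

# Fixed pattern: + builds a brand-new list, leaving the original prompt intact.
prompt_token_ids = [1, 2, 3]
token_ids = prompt_token_ids + generated_token_ids
assert prompt_token_ids == [1, 2, 3]        # prompt preserved
assert token_ids == [1, 2, 3, 4, 5]
```

This matters on cache recovery because the prompt token IDs are reused after a request is evicted and prefilled again; if the list had been extended in place, later uses of the prompt would silently include previously generated tokens.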
@@ -457,7 +460,7 @@ def evict_request(self, cancell_callback: Callable[[RequestId], None]) -> int:
             LOG.warn(
                 f"Cancelling a parallel-sampling request '{request_to_remove.request_id}'"
                 f"since it has generated more than {self.max_num_batched_tokens} tokens in total"
-                "and currently we do not support preempting such request.",
+                "and currently we do not support preempting such request.",
             )
             continue
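For context, the warning above is emitted while evicting requests to free cache space: a parallel-sampling request whose sequences have together generated more than `max_num_batched_tokens` tokens cannot currently be preempted and re-prefilled, so it is cancelled instead. The sketch below is a hypothetical illustration of that guard; the helper name, the total-token computation, and the callback wiring are assumptions, not the actual `evict_request` body.

```python
from typing import Callable

# Hypothetical illustration of the guard around the LOG.warn call above.
# Names and structure are assumptions for readability, not the engine's code.

def cancel_if_unpreemptible(request_to_remove, max_num_batched_tokens: int,
                            cancel_callback: Callable[[str], None]) -> bool:
    # Total tokens generated so far across all parallel-sampling sequences.
    total_generated = sum(
        len(seq.generated_token_ids) for seq in request_to_remove.generation_sequences
    )
    if request_to_remove.num_sequences > 1 and total_generated > max_num_batched_tokens:
        # Preempting such a request is not supported yet, so cancel it outright.
        cancel_callback(request_to_remove.request_id)
        return True
    return False
```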

