-
Notifications
You must be signed in to change notification settings - Fork 714
test: Add KV transfer cancellation test on TRT-LLM #4547
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -362,3 +362,94 @@ def test_request_cancellation_trtllm_prefill_cancel( | |
| logger.info( | ||
| "Completion request cancellation during prefill phase detected successfully" | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.trtllm_marker | ||
| @pytest.mark.gpu_1 | ||
| @pytest.mark.e2e | ||
| @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME) | ||
| def test_request_cancellation_trtllm_kv_transfer_cancel( | ||
| request, runtime_services, predownload_models | ||
| ): | ||
| """ | ||
| End-to-end test for request cancellation during prefill to decode KV transfer phase. | ||
|
|
||
| This test verifies that when a request is cancelled by the client during the KV transfer phase, | ||
| the system properly handles the cancellation and cleans up resources on the workers. | ||
| """ | ||
|
|
||
| # Step 1: Start the frontend | ||
| with DynamoFrontendProcess(request) as frontend: | ||
| logger.info("Frontend started successfully") | ||
|
|
||
| # Step 2: Start the prefill worker | ||
| with DynamoWorkerProcess(request, mode="prefill") as prefill_worker: | ||
| logger.info(f"Prefill Worker PID: {prefill_worker.get_pid()}") | ||
|
|
||
| # Step 3: Start the decode worker | ||
| with DynamoWorkerProcess(request, mode="decode") as decode_worker: | ||
| logger.info(f"Decode Worker PID: {decode_worker.get_pid()}") | ||
|
|
||
| # TODO: Why wait after worker ready fixes frontend 404 / 500 flakiness? | ||
| time.sleep(2) | ||
|
|
||
| # Step 4: Test request cancellation during KV transfer phase | ||
| logger.info( | ||
| "Testing completion request cancellation during KV transfer phase..." | ||
| ) | ||
|
|
||
| # Send request with long prompt | ||
| cancellable_req = send_cancellable_request( | ||
| "completion", use_long_prompt=True | ||
| ) | ||
|
|
||
| # Poll for "Prefill Request ID" pattern in prefill worker | ||
| request_id, prefill_log_offset = poll_for_pattern( | ||
| process=prefill_worker, | ||
| pattern="Prefill Request ID: ", | ||
| match_type="contains", | ||
| ) | ||
|
|
||
| # Poll for decode worker entry signaling start of KV transfer phase | ||
| _, decode_log_offset = poll_for_pattern( | ||
| process=decode_worker, | ||
| pattern=f"Decode Request ID: {request_id}", | ||
| poll_interval_ms=2, | ||
| ) | ||
|
|
||
| # Cancel during KV transfer phase in decode worker | ||
| cancellable_req.cancel() | ||
| logger.info( | ||
| f"Cancelled request ID: {request_id} at beginning of decode" | ||
| ) | ||
|
|
||
| # Poll for "Aborted Request ID" in decode worker | ||
| _, decode_log_offset = poll_for_pattern( | ||
| process=decode_worker, | ||
| pattern=f"Aborted Request ID: {request_id}", | ||
| log_offset=decode_log_offset, | ||
| ) | ||
|
|
||
| # Verify frontend log has kill message | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there any log we can find from the prefill worker to indicate the transfer has stopped / broken?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, I found the transfer always succeeded. which makes sense because the cancellation signal propagates into the TRT-LLM engine, but we wait until the engine gracefully exits the generate loop before returning from the request, so the engine can choose to finish receiving kv cache and then exit the request. |
||
| _, frontend_log_offset = poll_for_pattern( | ||
kthui marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| process=frontend, | ||
| pattern="issued control message Kill to sender", | ||
| ) | ||
|
|
||
| logger.info( | ||
| "Completion request cancellation at beginning of decode detected successfully" | ||
| ) | ||
|
|
||
| # Verify the workers are still functional | ||
| cancellable_req = send_cancellable_request("chat_completion_stream") | ||
| _, decode_log_offset = poll_for_pattern( | ||
| process=decode_worker, | ||
| pattern="Decode Request ID: ", | ||
| log_offset=decode_log_offset, | ||
| match_type="contains", | ||
| ) | ||
| read_streaming_responses(cancellable_req, expected_count=5) | ||
|
|
||
| logger.info( | ||
| "Workers are functional after cancellation during KV transfer" | ||
| ) | ||
Uh oh!
There was an error while loading. Please reload this page.