
Commit a0b2155

fix: add fallback mechanism on missing end marker during streaming (#8890)
* fix: add fallback mechanism on missing end marker during streaming
* address PR review comments
* fix tests

---------

Co-authored-by: chenmoneygithub <[email protected]>
1 parent 965da1d commit a0b2155

File tree: 3 files changed, +136 / -24 lines changed


dspy/streaming/streamify.py

Lines changed: 5 additions & 0 deletions
@@ -193,6 +193,11 @@ async def async_streamer(*args, **kwargs):
         elif isinstance(value, StatusMessage):
             yield value
         elif isinstance(value, Prediction):
+            # Flush remaining buffered tokens before yielding the Prediction instance
+            for listener in stream_listeners:
+                if final_chunk := listener.finalize():
+                    yield final_chunk
+
             if include_final_prediction_in_output_stream:
                 yield value
         elif (
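The added loop is the caller-visible half of the fix: anything a listener is still holding is emitted as one last StreamResponse immediately before the Prediction. A minimal consumer sketch of that behavior (the model name, question, and one-field signature are illustrative, not taken from this commit):

import asyncio

import dspy


async def main():
    # Stream a single "answer" field; the program and LM below are placeholders.
    program = dspy.streamify(
        dspy.Predict("question->answer"),
        stream_listeners=[dspy.streaming.StreamListener(signature_field_name="answer")],
    )
    with dspy.context(lm=dspy.LM("openai/gpt-4o-mini"), adapter=dspy.ChatAdapter()):
        async for value in program(question="What temperature does water boil at?"):
            if isinstance(value, dspy.streaming.StreamResponse):
                # With this fix, the buffered tail still arrives here as a final chunk
                # even when the `[[ ## completed ## ]]` marker never shows up.
                print(value.chunk, end="", flush=True)
            elif isinstance(value, dspy.Prediction):
                print("\n---", value.answer)


asyncio.run(main())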

dspy/streaming/streaming_listener.py

Lines changed: 29 additions & 1 deletion
@@ -208,13 +208,41 @@ def flush(self) -> str:
             return last_tokens[:boundary_index]
         elif isinstance(settings.adapter, ChatAdapter) or settings.adapter is None:
             boundary_index = last_tokens.find("[[")
+            if boundary_index == -1:
+                boundary_index = len(last_tokens)
             return last_tokens[:boundary_index]
         else:
             raise ValueError(
                 f"Unsupported adapter for streaming: {settings.adapter}, please use one of the following adapters: "
                 f"{', '.join([a.__name__ for a in ADAPTER_SUPPORT_STREAMING])}"
             )
 
+    def finalize(self) -> StreamResponse | None:
+        """Finalize the stream and flush any remaining buffered tokens.
+
+        This should be called when the stream ends.
+        It ensures no tokens are lost from the buffer and marks the final chunk appropriately.
+
+        Returns:
+            A StreamResponse with the remaining buffered tokens and is_last_chunk=True,
+            or None if there are no buffered tokens or the stream hasn't started.
+        """
+        if self.stream_end or not self.stream_start:
+            # Stream already ended or never started, nothing to finalize
+            return None
+
+        self.stream_end = True
+        if self.field_end_queue.qsize() > 0:
+            token = self.flush()
+            if token:
+                return StreamResponse(
+                    self.predict_name,
+                    self.signature_field_name,
+                    token,
+                    is_last_chunk=True,
+                )
+        return None
+
     @property
     def _output_type(self) -> type | None:
         try:
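For context on why finalize() is needed at all: while a field is streaming, the listener keeps the most recent tokens in field_end_queue so the end marker can be trimmed off before the text is surfaced; if the marker never arrives, that held-back tail would otherwise be lost. A stripped-down model of the idea (a toy, not the dspy implementation):

from collections import deque


class ToyListener:
    """Toy buffering model: hold back the last few tokens so a trailing
    marker such as "[[ ## completed ## ]]" can be trimmed before emitting."""

    def __init__(self, hold_back: int = 10):
        self.hold_back = hold_back
        self.buffer: deque[str] = deque()
        self.emitted: list[str] = []

    def receive(self, token: str) -> None:
        self.buffer.append(token)
        # Release a token only once it can no longer be part of the end marker.
        while len(self.buffer) > self.hold_back:
            self.emitted.append(self.buffer.popleft())

    def finalize(self) -> str:
        # Fallback when the stream ends without a marker: emit the held-back
        # tail, trimmed at a partial "[[" if one is present (mirrors flush()).
        tail = "".join(self.buffer)
        boundary = tail.find("[[")
        if boundary == -1:
            boundary = len(tail)
        return tail[:boundary]


listener = ToyListener(hold_back=3)
for token in ["This", " is", " a", " test", "."]:
    listener.receive(token)
print("".join(listener.emitted) + listener.finalize())  # -> "This is a test."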
@@ -224,7 +252,7 @@ def _output_type(self) -> type | None:
 
 
 
-def find_predictor_for_stream_listeners(program: "Module", stream_listeners: list[StreamListener]):
+def find_predictor_for_stream_listeners(program: "Module", stream_listeners: list[StreamListener]) -> dict[int, list[StreamListener]]:
     """Find the predictor for each stream listener.
 
     This is a utility function to automatically find the predictor for each stream listener. It is used when some
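The new annotation only documents the return shape: listeners grouped per predictor, keyed by an int. A sketch of how such a mapping could be built, assuming id()-based keys and matching by output field name (both are assumptions; neither is shown in this diff):

from collections import defaultdict


def group_listeners_by_predictor(named_predictors, stream_listeners):
    # named_predictors: (name, predictor) pairs, e.g. from Module.named_predictors().
    mapping: dict[int, list] = defaultdict(list)
    for _name, predictor in named_predictors:
        for listener in stream_listeners:
            # Hypothetical matching rule, for illustration only: the listener targets
            # a field that this predictor's signature produces.
            if listener.signature_field_name in predictor.signature.output_fields:
                mapping[id(predictor)].append(listener)
    return dict(mapping)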

tests/streaming/test_streaming.py

Lines changed: 102 additions & 23 deletions
@@ -166,9 +166,10 @@ def __call__(self, x: str, **kwargs):
 
     assert all_chunks[0].predict_name == "predict1"
     assert all_chunks[0].signature_field_name == "answer"
-
-    assert all_chunks[-1].predict_name == "predict2"
-    assert all_chunks[-1].signature_field_name == "judgement"
+    # The last chunk can be from either predictor because sometimes small LMs miss the `[[ ## completed ## ]]` marker,
+    # which results in an extra chunk that flushes out the buffer.
+    assert all_chunks[-2].predict_name == "predict2"
+    assert all_chunks[-2].signature_field_name == "judgement"
 
 
 @pytest.mark.anyio
@@ -299,10 +300,11 @@ def __call__(self, x: str, **kwargs):
     assert all_chunks[0].predict_name == "predict1"
     assert all_chunks[0].signature_field_name == "answer"
     assert all_chunks[0].is_last_chunk is False
-
-    assert all_chunks[-1].predict_name == "predict2"
-    assert all_chunks[-1].signature_field_name == "judgement"
-    assert all_chunks[-1].is_last_chunk is True
+    # The last chunk can be from either predictor because sometimes small LMs miss the `[[ ## completed ## ]]` marker,
+    # which results in an extra chunk that flushes out the buffer.
+    assert all_chunks[-2].predict_name == "predict2"
+    assert all_chunks[-2].signature_field_name == "judgement"
+    assert all_chunks[-2].is_last_chunk is True
 
 
 def test_sync_status_streaming():
@@ -599,6 +601,71 @@ async def gemini_stream_2(*args, **kwargs):
         )
 
 
+@pytest.mark.anyio
+async def test_stream_listener_missing_completion_marker_chat_adapter():
+    """Test that streaming works correctly when LLM response omits a final completion marker.
+
+    This test verifies that:
+    1. All tokens are yielded including those in the buffer
+    2. The last chunk is properly marked with is_last_chunk=True
+    3. No tokens are lost when the completion marker is missing
+    """
+
+    class MyProgram(dspy.Module):
+        def __init__(self):
+            super().__init__()
+            self.predict = dspy.Predict("question->answer")
+
+        def forward(self, question, **kwargs):
+            return self.predict(question=question, **kwargs)
+
+    async def incomplete_stream(*args, **kwargs):
+        """Stream that includes start marker but MISSING completion marker"""
+        # Start marker
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="[[ ##"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" answer"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" ## ]]\n\n"))])
+
+        # Content tokens - more than 10 to ensure buffering happens
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="This"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" is"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" a"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" test"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" response"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" with"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" many"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" tokens"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" to"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" ensure"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" buffering"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" works"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" correctly"))])
+        yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content="."))])
+        # NO COMPLETION MARKER
+
+    with mock.patch("litellm.acompletion", side_effect=incomplete_stream):
+        program = dspy.streamify(
+            MyProgram(),
+            stream_listeners=[
+                dspy.streaming.StreamListener(signature_field_name="answer"),
+            ],
+        )
+        with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.ChatAdapter()):
+            output = program(question="Test question")
+            all_chunks = []
+            final_prediction = None
+            async for value in output:
+                if isinstance(value, dspy.streaming.StreamResponse):
+                    all_chunks.append(value)
+                elif isinstance(value, dspy.Prediction):
+                    final_prediction = value
+
+    full_content = "".join([chunk.chunk for chunk in all_chunks])
+    expected_content = "This is a test response with many tokens to ensure buffering works correctly."
+    assert full_content == expected_content
+    assert final_prediction.answer == expected_content
+
+
 @pytest.mark.anyio
 async def test_stream_listener_returns_correct_chunk_json_adapter_untokenized_stream():
     class MyProgram(dspy.Module):
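The assertions above check content equality only; if one also wanted to exercise point 2 of the test's docstring directly, a follow-up check along these lines would fit (illustrative, not part of this commit):

    assert all_chunks[-1].is_last_chunk is True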
@@ -915,9 +982,10 @@ async def stream(*args, **kwargs):
         yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" ##"))])
         yield ModelResponseStream(model="gpt-4o-mini", choices=[StreamingChoices(delta=Delta(content=" ]]"))])
 
-
     with mock.patch("litellm.acompletion", side_effect=stream):
-        with dspy.context(lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.ChatAdapter(native_response_types=[CustomType])):
+        with dspy.context(
+            lm=dspy.LM("openai/gpt-4o-mini", cache=False), adapter=dspy.ChatAdapter(native_response_types=[CustomType])
+        ):
             output = program(question="why did a chicken cross the kitchen?")
             all_chunks = []
             async for value in output:
@@ -934,6 +1002,7 @@ async def stream(*args, **kwargs):
 async def test_streaming_with_citations():
     class AnswerWithSources(dspy.Signature):
         """Answer questions using provided documents with citations."""
+
         documents: list[Document] = dspy.InputField()
         question: str = dspy.InputField()
         answer: str = dspy.OutputField()
@@ -964,19 +1033,26 @@ async def citation_stream(*args, **kwargs):
         yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="g"))])
         yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" to "))])
         yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content="the references,"))])
-        yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(
-            content="",
-            provider_specific_fields={
-                "citation": {
-                    "type": "char_location",
-                    "cited_text": "water boils at 100°C",
-                    "document_index": 0,
-                    "document_title": "Physics Facts",
-                    "start_char_index": 0,
-                    "end_char_index": 19
-                }
-            }
-        ))])
+        yield ModelResponseStream(
+            model="claude",
+            choices=[
+                StreamingChoices(
+                    delta=Delta(
+                        content="",
+                        provider_specific_fields={
+                            "citation": {
+                                "type": "char_location",
+                                "cited_text": "water boils at 100°C",
+                                "document_index": 0,
+                                "document_title": "Physics Facts",
+                                "start_char_index": 0,
+                                "end_char_index": 19,
+                            }
+                        },
+                    )
+                )
+            ],
+        )
         yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" water"))])
         yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" boils"))])
         yield ModelResponseStream(model="claude", choices=[StreamingChoices(delta=Delta(content=" at"))])
@@ -1000,7 +1076,10 @@ async def citation_stream(*args, **kwargs):
     # Create test documents
     docs = [Document(data="Water boils at 100°C at standard pressure.", title="Physics Facts")]
 
-    with dspy.context(lm=dspy.LM("anthropic/claude-3-5-sonnet-20241022", cache=False), adapter=dspy.ChatAdapter(native_response_types=[Citations])):
+    with dspy.context(
+        lm=dspy.LM("anthropic/claude-3-5-sonnet-20241022", cache=False),
+        adapter=dspy.ChatAdapter(native_response_types=[Citations]),
+    ):
         output = program(documents=docs, question="What temperature does water boil?")
         citation_chunks = []
         answer_chunks = []

0 commit comments
