Update end-of-turn handling; faster response with preview server; adjustable penalties for end-of-turn delay; new defaults

sam-s10s · sam-s10s · commit e3af4dadc911 · 2025-10-30T02:01:56.000Z
diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py
@@ -231,6 +231,31 @@ def __init__(
                 self._logger.warning("Smart Turn model not available. Falling back to ADAPTIVE.")
                 self._config.end_of_utterance_mode = EndOfUtteranceMode.ADAPTIVE
 
+        # EOU mode
+        self._eou_mode: EndOfUtteranceMode = self._config.end_of_utterance_mode
+
+        # Uses fixed EndOfUtterance message
+        self._uses_fixed_eou: bool = self._eou_mode == EndOfUtteranceMode.FIXED
+
+        # Uses ForceEndOfUtterance message
+        self._uses_forced_eou: bool = self._config.enable_preview_features and self._eou_mode in [
+            EndOfUtteranceMode.ADAPTIVE,
+            EndOfUtteranceMode.SMART_TURN,
+        ]
+
+        # EOT start / stop messages
+        self._uses_turn_start_end: bool = self._eou_mode in [
+            EndOfUtteranceMode.FIXED,
+            EndOfUtteranceMode.ADAPTIVE,
+            EndOfUtteranceMode.SMART_TURN,
+        ]
+
+        # EOT prediction
+        self._uses_eot_prediction: bool = self._eou_mode in [
+            EndOfUtteranceMode.ADAPTIVE,
+            EndOfUtteranceMode.SMART_TURN,
+        ]
+
         # Diarization / speaker focus
         self._dz_enabled: bool = self._config.enable_diarization
         self._dz_config = self._config.speaker_config
@@ -563,6 +588,10 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
         """
         self._dz_config = config
 
+    # ============================================================================
+    # PUBLIC UTTERANCE / TURN MANAGEMENT
+    # ============================================================================
+
     def finalize(self, ttl: Optional[float] = None, end_of_turn: bool = False) -> None:
         """Finalize segments.
 
@@ -590,17 +619,9 @@ async def emit() -> None:
             if ttl is not None and ttl > 0:
                 await asyncio.sleep(ttl)
 
-            # Emit segments or finalize STT message
-            if self._config.enable_preview_features:
+            # Emit segments or finalize STT message (only for ADAPTIVE and SMART_TURN)
+            if self._uses_forced_eou and not end_of_turn:
                 """Uses the forced end of utterance message to emit segments and finalize."""
-
-                # Listen for the `END_OF_UTTERANCE` event
-                @self.once(ServerMessageType.END_OF_UTTERANCE)  # type: ignore[misc]
-                def _on_eou(message: dict[str, Any]) -> None:
-                    self._stt_message_queue.put_nowait(
-                        lambda: self._emit_segments(finalize=True, end_of_turn=end_of_turn)
-                    )
-
                 # Emit the message
                 await self.force_end_of_utterance()
 
@@ -668,6 +689,16 @@ def _evt_on_partial_transcript(message: dict[str, Any]) -> None:
         def _evt_on_final_transcript(message: dict[str, Any]) -> None:
             self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))
 
+        # End of Utterance (FIXED mode only)
+        if self._uses_fixed_eou:
+
+            @self.on(ServerMessageType.END_OF_UTTERANCE)  # type: ignore[misc]
+            def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
+                async def _trigger_end_of_turn() -> None:
+                    self.end_of_turn()
+
+                self._stt_message_queue.put_nowait(_trigger_end_of_turn)
+
     def _emit_message(self, message: BaseMessage) -> None:
         """Emit a message to the client.
 
@@ -1185,6 +1216,9 @@ async def _emit_segments(self, finalize: bool = False, end_of_turn: bool = False
                 ),
             )
 
+            # Stop the EOT handler
+            self._end_of_turn_handler.complete_handler()
+
             # Reset the previous view
             self._previous_view = None
             self._turn_start_time = None
@@ -1203,9 +1237,6 @@ async def _calculate_finalize_delay(
         the segments to the client.
 
         Args:
-            view: The speaker fragment to evaluate.
-            view_changes: The annotation result to use for evaluation.
-            filter_flags: The annotation flags to use for evaluation.
             smart_turn_prediction: The smart turn prediction result to use for evaluation.
 
         Returns:
@@ -1251,13 +1282,13 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
 
             # Iterate over the penalties
             for penalty in self._config.end_of_turn_config.penalties:
-                description = "+".join(penalty.annotation)
+                description = "__".join(penalty.annotation)
                 if not penalty.is_not:
                     if last_active_segment.annotation.has(*penalty.annotation):
                         add_multipler_reason(penalty.penalty, description)
                 else:
                     if not last_active_segment.annotation.has(*penalty.annotation):
-                        add_multipler_reason(penalty.penalty, description)
+                        add_multipler_reason(penalty.penalty, f"not__{description}")
 
         # Smart turn prediction
         if smart_turn_prediction:
@@ -1282,16 +1313,15 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
         finalize_delay = max(clamped_delay - self._last_ttfb, self._config.end_of_turn_config.min_end_of_turn_delay)
 
         # Emit prediction
-        if self.listeners(AgentServerMessageType.END_OF_TURN_PREDICTION):
-            self._emit_message(
-                TurnPredictionMessage(
-                    turn_id=self._turn_id,
-                    metadata=TurnPredictionMetadata(
-                        ttl=round(finalize_delay, 2),
-                        reasons=[_reason for _, _reason in reasons],
-                    ),
+        self._emit_message(
+            TurnPredictionMessage(
+                turn_id=self._turn_id,
+                metadata=TurnPredictionMetadata(
+                    ttl=round(finalize_delay, 2),
+                    reasons=[_reason for _, _reason in reasons],
                 ),
-            )
+            ),
+        )
 
         # Return the time
         return finalize_delay
@@ -1332,7 +1362,7 @@ async def _predict_smart_turn(self, end_time: float, language: str) -> SmartTurn
         return prediction
 
     # ============================================================================
-    # VAD (VOICE ACTIVITY DETECTION)
+    # VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION
     # ============================================================================
 
     def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
@@ -1420,30 +1450,32 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
         event_time = speaker_start_time if self._is_speaking else speaker_end_time
 
         # Emit start of turn (not when using EXTERNAL)
-        if self._config.end_of_utterance_mode is not EndOfUtteranceMode.EXTERNAL:
-            # New turn started
-            if not self._end_of_turn_handler.handler_active and self._is_speaking:
-                self._end_of_turn_handler.start_handler()
-                self._turn_id = self._end_of_turn_handler.handler_id
+        if self._uses_turn_start_end:
+            """Trigger a start of turn message and also reset pending end of turn prediction."""
+
+            # End of turn prediction was wrong (turn continues): emit a reset
+            if self._uses_eot_prediction and self._end_of_turn_handler.handler_active and self._is_speaking:
+                self._end_of_turn_handler.reset()
                 self._emit_message(
                     TurnStartEndResetMessage(
-                        message=AgentServerMessageType.START_OF_TURN,
+                        message=AgentServerMessageType.END_OF_TURN_RESET,
                         turn_id=self._turn_id,
                         metadata=MessageTimeMetadata(
-                            start_time=event_time,
+                            time=event_time,
                         ),
                     ),
                 )
 
-            # Emit end of turn prediction wrong
-            elif self._end_of_turn_handler.handler_active and self._is_speaking:
-                self._end_of_turn_handler.reset()
+            # New turn started
+            elif self._is_speaking and not self._end_of_turn_handler.handler_active:
+                self._end_of_turn_handler.start_handler()
+                self._turn_id = self._end_of_turn_handler.handler_id
                 self._emit_message(
                     TurnStartEndResetMessage(
-                        message=AgentServerMessageType.END_OF_TURN_RESET,
+                        message=AgentServerMessageType.START_OF_TURN,
                         turn_id=self._turn_id,
                         metadata=MessageTimeMetadata(
-                            time=event_time,
+                            start_time=event_time,
                         ),
                     ),
                 )
@@ -1462,51 +1494,57 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
             ),
         )
 
+        # Speaking has started
+        if self._is_speaking:
+            self._handle_speaker_started()
+
         # Speaking has stopped
-        if not self._is_speaking:
-            """Reset the current speaker and do smart turn detection (if enabled)."""
+        else:
+            self._handle_speaker_stopped(speaker_end_time)
 
-            # Reset current speaker
-            self._current_speaker = None
+    def _handle_speaker_started(self) -> None:
+        """Reset timers when a new speaker starts speaking after silence."""
 
-            # Add task for end of utterance
-            self._end_of_utterance_handler.update_timer(self._config.end_of_utterance_silence_trigger)
+        # Reset the handlers
+        self._end_of_utterance_handler.reset()
+        self._end_of_turn_handler.reset()
 
-            # For ADAPTIVE and SMART_TURN only
-            if self._config.end_of_utterance_mode is not EndOfUtteranceMode.EXTERNAL:
-                """When not EXTERNAL, we need to do EOT detection / prediction."""
+    def _handle_speaker_stopped(self, speaker_end_time: float) -> None:
+        """Reset the current speaker and do smart turn detection (if enabled)."""
 
-                # Callback
-                async def do_eot_detection(end_time: float, language: str) -> None:
-                    try:
-                        # Wait for Smart Turn result
-                        if self._config.end_of_utterance_mode == EndOfUtteranceMode.SMART_TURN:
-                            result = await self._predict_smart_turn(end_time, language)
-                        else:
-                            result = None
+        # Reset current speaker
+        self._current_speaker = None
 
-                        # Create a new task to evaluate the finalize delay
-                        delay = await self._calculate_finalize_delay(smart_turn_prediction=result)
+        # Add task for end of utterance
+        self._end_of_utterance_handler.update_timer(self._config.end_of_utterance_silence_trigger)
 
-                        # Set the finalize timer (go now if no delay)
-                        self._end_of_turn_handler.update_timer(delay or 0.01)
+        # For ADAPTIVE and SMART_TURN only
+        if self._uses_eot_prediction:
+            """In ADAPTIVE and SMART_TURN modes, we need to do EOT detection / prediction."""
 
-                    except asyncio.CancelledError:
-                        pass
+            # Callback
+            async def do_eot_detection(end_time: float, language: str) -> None:
+                try:
+                    # Wait for Smart Turn result
+                    if self._eou_mode == EndOfUtteranceMode.SMART_TURN:
+                        result = await self._predict_smart_turn(end_time, language)
+                    else:
+                        result = None
 
-                # Add task
-                self._end_of_turn_handler.add_task(
-                    asyncio.create_task(do_eot_detection(speaker_end_time, self._config.language)),
-                    self._config.end_of_utterance_mode.value,
-                )
+                    # Create a new task to evaluate the finalize delay
+                    delay = await self._calculate_finalize_delay(smart_turn_prediction=result)
 
-        # Speaking has started
-        else:
-            """When speaking has started, reset speaking-related variables."""
+                    # Set the finalize timer (go now if no delay)
+                    self._end_of_turn_handler.update_timer(delay or 0.01)
+
+                except asyncio.CancelledError:
+                    pass
 
-            # Reset the handlers
-            self._end_of_utterance_handler.reset()
-            self._end_of_turn_handler.reset()
+            # Add task
+            self._end_of_turn_handler.add_task(
+                asyncio.create_task(do_eot_detection(speaker_end_time, self._config.language)),
+                self._eou_mode.value,
+            )
 
     # ============================================================================
     # HELPER METHODS
diff --git a/tests/voice/test_04_models.py b/tests/voice/test_04_models.py
@@ -267,14 +267,10 @@ async def test_presets():
     assert low_latency_one.max_delay == 12.34
     assert low_latency_one.enable_diarization is False
 
-    print(low_latency_one.model_dump_json())
-
     # Overlay #2
     low_latency_two: VoiceAgentConfig | None = VoiceAgentConfigPreset.LOW_LATENCY(
         VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_mode=SpeechSegmentEmitMode.ON_SPEAKER_ENDED))
     )
     assert low_latency_two is not None
     assert low_latency_two.enable_diarization is True
     assert low_latency_two.speech_segment_config.emit_mode == SpeechSegmentEmitMode.ON_SPEAKER_ENDED
-
-    print(low_latency_two.model_dump_json())
diff --git a/tests/voice/test_05_utterance.py b/tests/voice/test_05_utterance.py
@@ -310,9 +310,7 @@ async def test_end_of_utterance_adaptive_vad():
 
     # Test conversation
     log = ConversationLog(os.path.join(os.path.dirname(__file__), "./assets/chat2.jsonl"))
-    chat = log.get_conversation(
-        ["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript", "EndOfUtterance"]
-    )
+    chat = log.get_conversation(["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript"])
 
     # Start time
     start_time = datetime.datetime.now()
@@ -393,7 +391,7 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True):
     assert last_final_time is not None
 
     # Timing info
-    timeout = adaptive_timeout * 1.5
+    timeout = adaptive_timeout * 2.0
 
     # Wait for EndOfUtterance
     try: