@@ -231,6 +231,31 @@ def __init__(
231231 self ._logger .warning ("Smart Turn model not available. Falling back to ADAPTIVE." )
232232 self ._config .end_of_utterance_mode = EndOfUtteranceMode .ADAPTIVE
233233
234+ # EOU mode
235+ self ._eou_mode : EndOfUtteranceMode = self ._config .end_of_utterance_mode
236+
237+ # Uses fixed EndOfUtterance message
238+ self ._uses_fixed_eou : bool = self ._eou_mode == EndOfUtteranceMode .FIXED
239+
240+ # Uses ForceEndOfUtterance message
241+ self ._uses_forced_eou : bool = self ._config .enable_preview_features and self ._eou_mode in [
242+ EndOfUtteranceMode .ADAPTIVE ,
243+ EndOfUtteranceMode .SMART_TURN ,
244+ ]
245+
246+ # EOT start / stop messages
247+ self ._uses_turn_start_end : bool = self ._eou_mode in [
248+ EndOfUtteranceMode .FIXED ,
249+ EndOfUtteranceMode .ADAPTIVE ,
250+ EndOfUtteranceMode .SMART_TURN ,
251+ ]
252+
253+ # EOT prediction
254+ self ._uses_eot_prediction : bool = self ._eou_mode in [
255+ EndOfUtteranceMode .ADAPTIVE ,
256+ EndOfUtteranceMode .SMART_TURN ,
257+ ]
258+
234259 # Diarization / speaker focus
235260 self ._dz_enabled : bool = self ._config .enable_diarization
236261 self ._dz_config = self ._config .speaker_config
@@ -563,6 +588,10 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
563588 """
564589 self ._dz_config = config
565590
591+ # ============================================================================
592+ # PUBLIC UTTERANCE / TURN MANAGEMENT
593+ # ============================================================================
594+
566595 def finalize (self , ttl : Optional [float ] = None , end_of_turn : bool = False ) -> None :
567596 """Finalize segments.
568597
@@ -590,17 +619,9 @@ async def emit() -> None:
590619 if ttl is not None and ttl > 0 :
591620 await asyncio .sleep (ttl )
592621
593- # Emit segments or finalize STT message
594- if self ._config . enable_preview_features :
622+ # Emit segments or finalize STT message (only for ADAPTIVE and SMART_TURN)
623+ if self ._uses_forced_eou and not end_of_turn :
595624 """Uses the forced end of utterance message to emit segments and finalize."""
596-
597- # Listen for the `END_OF_UTTERANCE` event
598- @self .once (ServerMessageType .END_OF_UTTERANCE ) # type: ignore[misc]
599- def _on_eou (message : dict [str , Any ]) -> None :
600- self ._stt_message_queue .put_nowait (
601- lambda : self ._emit_segments (finalize = True , end_of_turn = end_of_turn )
602- )
603-
604625 # Emit the message
605626 await self .force_end_of_utterance ()
606627
@@ -668,6 +689,16 @@ def _evt_on_partial_transcript(message: dict[str, Any]) -> None:
668689 def _evt_on_final_transcript (message : dict [str , Any ]) -> None :
669690 self ._stt_message_queue .put_nowait (lambda : self ._handle_transcript (message , is_final = True ))
670691
692+ # End of Utterance (FIXED mode only)
693+ if self ._uses_fixed_eou :
694+
695+ @self .on (ServerMessageType .END_OF_UTTERANCE ) # type: ignore[misc]
696+ def _evt_on_end_of_utterance (message : dict [str , Any ]) -> None :
697+ async def _trigger_end_of_turn () -> None :
698+ self .end_of_turn ()
699+
700+ self ._stt_message_queue .put_nowait (_trigger_end_of_turn )
701+
671702 def _emit_message (self , message : BaseMessage ) -> None :
672703 """Emit a message to the client.
673704
@@ -1185,6 +1216,9 @@ async def _emit_segments(self, finalize: bool = False, end_of_turn: bool = False
11851216 ),
11861217 )
11871218
1219+ # Stop the EOT handler
1220+ self ._end_of_turn_handler .complete_handler ()
1221+
11881222 # Reset the previous view
11891223 self ._previous_view = None
11901224 self ._turn_start_time = None
@@ -1203,9 +1237,6 @@ async def _calculate_finalize_delay(
12031237 the segments to the client.
12041238
12051239 Args:
1206- view: The speaker fragment to evaluate.
1207- view_changes: The annotation result to use for evaluation.
1208- filter_flags: The annotation flags to use for evaluation.
12091240 smart_turn_prediction: The smart turn prediction result to use for evaluation.
12101241
12111242 Returns:
@@ -1251,13 +1282,13 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
12511282
12521283 # Iterate over the penalties
12531284 for penalty in self ._config .end_of_turn_config .penalties :
1254- description = "+ " .join (penalty .annotation )
1285+ description = "__ " .join (penalty .annotation )
12551286 if not penalty .is_not :
12561287 if last_active_segment .annotation .has (* penalty .annotation ):
12571288 add_multipler_reason (penalty .penalty , description )
12581289 else :
12591290 if not last_active_segment .annotation .has (* penalty .annotation ):
1260- add_multipler_reason (penalty .penalty , description )
1291+ add_multipler_reason (penalty .penalty , f"not__ { description } " )
12611292
12621293 # Smart turn prediction
12631294 if smart_turn_prediction :
@@ -1282,16 +1313,15 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
12821313 finalize_delay = max (clamped_delay - self ._last_ttfb , self ._config .end_of_turn_config .min_end_of_turn_delay )
12831314
12841315 # Emit prediction
1285- if self .listeners (AgentServerMessageType .END_OF_TURN_PREDICTION ):
1286- self ._emit_message (
1287- TurnPredictionMessage (
1288- turn_id = self ._turn_id ,
1289- metadata = TurnPredictionMetadata (
1290- ttl = round (finalize_delay , 2 ),
1291- reasons = [_reason for _ , _reason in reasons ],
1292- ),
1316+ self ._emit_message (
1317+ TurnPredictionMessage (
1318+ turn_id = self ._turn_id ,
1319+ metadata = TurnPredictionMetadata (
1320+ ttl = round (finalize_delay , 2 ),
1321+ reasons = [_reason for _ , _reason in reasons ],
12931322 ),
1294- )
1323+ ),
1324+ )
12951325
12961326 # Return the time
12971327 return finalize_delay
@@ -1332,7 +1362,7 @@ async def _predict_smart_turn(self, end_time: float, language: str) -> SmartTurn
13321362 return prediction
13331363
13341364 # ============================================================================
1335- # VAD (VOICE ACTIVITY DETECTION)
1365+ # VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION
13361366 # ============================================================================
13371367
13381368 def _vad_evaluation (self , fragments : list [SpeechFragment ]) -> None :
@@ -1420,30 +1450,32 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
14201450 event_time = speaker_start_time if self ._is_speaking else speaker_end_time
14211451
14221452 # Emit start of turn (not when using EXTERNAL)
1423- if self ._config .end_of_utterance_mode is not EndOfUtteranceMode .EXTERNAL :
1424- # New turn started
1425- if not self ._end_of_turn_handler .handler_active and self ._is_speaking :
1426- self ._end_of_turn_handler .start_handler ()
1427- self ._turn_id = self ._end_of_turn_handler .handler_id
1453+ if self ._uses_turn_start_end :
1454+ """Trigger a start of turn message and also reset pending end of turn prediction."""
1455+
1456+ # Emit end of turn prediction was wrong (turn continues)
1457+ if self ._uses_eot_prediction and self ._end_of_turn_handler .handler_active and self ._is_speaking :
1458+ self ._end_of_turn_handler .reset ()
14281459 self ._emit_message (
14291460 TurnStartEndResetMessage (
1430- message = AgentServerMessageType .START_OF_TURN ,
1461+ message = AgentServerMessageType .END_OF_TURN_RESET ,
14311462 turn_id = self ._turn_id ,
14321463 metadata = MessageTimeMetadata (
1433- start_time = event_time ,
1464+ time = event_time ,
14341465 ),
14351466 ),
14361467 )
14371468
1438- # Emit end of turn prediction wrong
1439- elif self ._end_of_turn_handler .handler_active and self ._is_speaking :
1440- self ._end_of_turn_handler .reset ()
1469+ # New turn started
1470+ elif self ._is_speaking and not self ._end_of_turn_handler .handler_active :
1471+ self ._end_of_turn_handler .start_handler ()
1472+ self ._turn_id = self ._end_of_turn_handler .handler_id
14411473 self ._emit_message (
14421474 TurnStartEndResetMessage (
1443- message = AgentServerMessageType .END_OF_TURN_RESET ,
1475+ message = AgentServerMessageType .START_OF_TURN ,
14441476 turn_id = self ._turn_id ,
14451477 metadata = MessageTimeMetadata (
1446- time = event_time ,
1478+ start_time = event_time ,
14471479 ),
14481480 ),
14491481 )
@@ -1462,51 +1494,57 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
14621494 ),
14631495 )
14641496
1497+ # Speaking has started
1498+ if self ._is_speaking :
1499+ self ._handle_speaker_started ()
1500+
14651501 # Speaking has stopped
1466- if not self . _is_speaking :
1467- """Reset the current speaker and do smart turn detection (if enabled)."""
1502+ else :
1503+ self . _handle_speaker_stopped ( speaker_end_time )
14681504
1469- # Reset current speaker
1470- self . _current_speaker = None
1505+ def _handle_speaker_started ( self ) -> None :
1506+ """Reset timers when a new speaker starts speaking after silence."""
14711507
1472- # Add task for end of utterance
1473- self ._end_of_utterance_handler .update_timer (self ._config .end_of_utterance_silence_trigger )
1508+ # Reset the handlers
1509+ self ._end_of_utterance_handler .reset ()
1510+ self ._end_of_turn_handler .reset ()
14741511
1475- # For ADAPTIVE and SMART_TURN only
1476- if self ._config .end_of_utterance_mode is not EndOfUtteranceMode .EXTERNAL :
1477- """When not EXTERNAL, we need to do EOT detection / prediction."""
1512+ def _handle_speaker_stopped (self , speaker_end_time : float ) -> None :
1513+ """Reset the current speaker and do smart turn detection (if enabled)."""
14781514
1479- # Callback
1480- async def do_eot_detection (end_time : float , language : str ) -> None :
1481- try :
1482- # Wait for Smart Turn result
1483- if self ._config .end_of_utterance_mode == EndOfUtteranceMode .SMART_TURN :
1484- result = await self ._predict_smart_turn (end_time , language )
1485- else :
1486- result = None
1515+ # Reset current speaker
1516+ self ._current_speaker = None
14871517
1488- # Create a new task to evaluate the finalize delay
1489- delay = await self ._calculate_finalize_delay ( smart_turn_prediction = result )
1518+ # Add task for end of utterance
1519+ self ._end_of_utterance_handler . update_timer ( self . _config . end_of_utterance_silence_trigger )
14901520
1491- # Set the finalize timer (go now if no delay)
1492- self ._end_of_turn_handler .update_timer (delay or 0.01 )
1521+ # For ADAPTIVE and SMART_TURN only
1522+ if self ._uses_eot_prediction :
1523+ """When not EXTERNAL, we need to do EOT detection / prediction."""
14931524
1494- except asyncio .CancelledError :
1495- pass
1525+ # Callback
1526+ async def do_eot_detection (end_time : float , language : str ) -> None :
1527+ try :
1528+ # Wait for Smart Turn result
1529+ if self ._eou_mode == EndOfUtteranceMode .SMART_TURN :
1530+ result = await self ._predict_smart_turn (end_time , language )
1531+ else :
1532+ result = None
14961533
1497- # Add task
1498- self ._end_of_turn_handler .add_task (
1499- asyncio .create_task (do_eot_detection (speaker_end_time , self ._config .language )),
1500- self ._config .end_of_utterance_mode .value ,
1501- )
1534+ # Create a new task to evaluate the finalize delay
1535+ delay = await self ._calculate_finalize_delay (smart_turn_prediction = result )
15021536
1503- # Speaking has started
1504- else :
1505- """When speaking has started, reset speaking-related variables."""
1537+ # Set the finalize timer (go now if no delay)
1538+ self ._end_of_turn_handler .update_timer (delay or 0.01 )
1539+
1540+ except asyncio .CancelledError :
1541+ pass
15061542
1507- # Reset the handlers
1508- self ._end_of_utterance_handler .reset ()
1509- self ._end_of_turn_handler .reset ()
1543+ # Add task
1544+ self ._end_of_turn_handler .add_task (
1545+ asyncio .create_task (do_eot_detection (speaker_end_time , self ._config .language )),
1546+ self ._eou_mode .value ,
1547+ )
15101548
15111549 # ============================================================================
15121550 # HELPER METHODS
0 commit comments