Skip to content

Commit e3af4da

Browse files
committed
updated handling of end of turn; faster response with preview server; adjustable penalties for end of turn delay; new defaults
1 parent 9e012da commit e3af4da

File tree

3 files changed

+111
-79
lines changed

3 files changed

+111
-79
lines changed

sdk/voice/speechmatics/voice/_client.py

Lines changed: 109 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,31 @@ def __init__(
231231
self._logger.warning("Smart Turn model not available. Falling back to ADAPTIVE.")
232232
self._config.end_of_utterance_mode = EndOfUtteranceMode.ADAPTIVE
233233

234+
# EOU mode
235+
self._eou_mode: EndOfUtteranceMode = self._config.end_of_utterance_mode
236+
237+
# Uses fixed EndOfUtterance message
238+
self._uses_fixed_eou: bool = self._eou_mode == EndOfUtteranceMode.FIXED
239+
240+
# Uses ForceEndOfUtterance message
241+
self._uses_forced_eou: bool = self._config.enable_preview_features and self._eou_mode in [
242+
EndOfUtteranceMode.ADAPTIVE,
243+
EndOfUtteranceMode.SMART_TURN,
244+
]
245+
246+
# EOT start / stop messages
247+
self._uses_turn_start_end: bool = self._eou_mode in [
248+
EndOfUtteranceMode.FIXED,
249+
EndOfUtteranceMode.ADAPTIVE,
250+
EndOfUtteranceMode.SMART_TURN,
251+
]
252+
253+
# EOT prediction
254+
self._uses_eot_prediction: bool = self._eou_mode in [
255+
EndOfUtteranceMode.ADAPTIVE,
256+
EndOfUtteranceMode.SMART_TURN,
257+
]
258+
234259
# Diarization / speaker focus
235260
self._dz_enabled: bool = self._config.enable_diarization
236261
self._dz_config = self._config.speaker_config
@@ -563,6 +588,10 @@ def update_diarization_config(self, config: SpeakerFocusConfig) -> None:
563588
"""
564589
self._dz_config = config
565590

591+
# ============================================================================
592+
# PUBLIC UTTERANCE / TURN MANAGEMENT
593+
# ============================================================================
594+
566595
def finalize(self, ttl: Optional[float] = None, end_of_turn: bool = False) -> None:
567596
"""Finalize segments.
568597
@@ -590,17 +619,9 @@ async def emit() -> None:
590619
if ttl is not None and ttl > 0:
591620
await asyncio.sleep(ttl)
592621

593-
# Emit segments or finalize STT message
594-
if self._config.enable_preview_features:
622+
# Emit segments or finalize STT message (only for ADAPTIVE and SMART_TURN)
623+
if self._uses_forced_eou and not end_of_turn:
595624
"""Uses the forced end of utterance message to emit segments and finalize."""
596-
597-
# Listen for the `END_OF_UTTERANCE` event
598-
@self.once(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc]
599-
def _on_eou(message: dict[str, Any]) -> None:
600-
self._stt_message_queue.put_nowait(
601-
lambda: self._emit_segments(finalize=True, end_of_turn=end_of_turn)
602-
)
603-
604625
# Emit the message
605626
await self.force_end_of_utterance()
606627

@@ -668,6 +689,16 @@ def _evt_on_partial_transcript(message: dict[str, Any]) -> None:
668689
def _evt_on_final_transcript(message: dict[str, Any]) -> None:
669690
self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True))
670691

692+
# End of Utterance (FIXED mode only)
693+
if self._uses_fixed_eou:
694+
695+
@self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc]
696+
def _evt_on_end_of_utterance(message: dict[str, Any]) -> None:
697+
async def _trigger_end_of_turn() -> None:
698+
self.end_of_turn()
699+
700+
self._stt_message_queue.put_nowait(_trigger_end_of_turn)
701+
671702
def _emit_message(self, message: BaseMessage) -> None:
672703
"""Emit a message to the client.
673704
@@ -1185,6 +1216,9 @@ async def _emit_segments(self, finalize: bool = False, end_of_turn: bool = False
11851216
),
11861217
)
11871218

1219+
# Stop the EOT handler
1220+
self._end_of_turn_handler.complete_handler()
1221+
11881222
# Reset the previous view
11891223
self._previous_view = None
11901224
self._turn_start_time = None
@@ -1203,9 +1237,6 @@ async def _calculate_finalize_delay(
12031237
the segments to the client.
12041238
12051239
Args:
1206-
view: The speaker fragment to evaluate.
1207-
view_changes: The annotation result to use for evaluation.
1208-
filter_flags: The annotation flags to use for evaluation.
12091240
smart_turn_prediction: The smart turn prediction result to use for evaluation.
12101241
12111242
Returns:
@@ -1251,13 +1282,13 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
12511282

12521283
# Iterate over the penalties
12531284
for penalty in self._config.end_of_turn_config.penalties:
1254-
description = "+".join(penalty.annotation)
1285+
description = "__".join(penalty.annotation)
12551286
if not penalty.is_not:
12561287
if last_active_segment.annotation.has(*penalty.annotation):
12571288
add_multipler_reason(penalty.penalty, description)
12581289
else:
12591290
if not last_active_segment.annotation.has(*penalty.annotation):
1260-
add_multipler_reason(penalty.penalty, description)
1291+
add_multipler_reason(penalty.penalty, f"not__{description}")
12611292

12621293
# Smart turn prediction
12631294
if smart_turn_prediction:
@@ -1282,16 +1313,15 @@ def add_multipler_reason(multiplier: float, reason: str) -> None:
12821313
finalize_delay = max(clamped_delay - self._last_ttfb, self._config.end_of_turn_config.min_end_of_turn_delay)
12831314

12841315
# Emit prediction
1285-
if self.listeners(AgentServerMessageType.END_OF_TURN_PREDICTION):
1286-
self._emit_message(
1287-
TurnPredictionMessage(
1288-
turn_id=self._turn_id,
1289-
metadata=TurnPredictionMetadata(
1290-
ttl=round(finalize_delay, 2),
1291-
reasons=[_reason for _, _reason in reasons],
1292-
),
1316+
self._emit_message(
1317+
TurnPredictionMessage(
1318+
turn_id=self._turn_id,
1319+
metadata=TurnPredictionMetadata(
1320+
ttl=round(finalize_delay, 2),
1321+
reasons=[_reason for _, _reason in reasons],
12931322
),
1294-
)
1323+
),
1324+
)
12951325

12961326
# Return the time
12971327
return finalize_delay
@@ -1332,7 +1362,7 @@ async def _predict_smart_turn(self, end_time: float, language: str) -> SmartTurn
13321362
return prediction
13331363

13341364
# ============================================================================
1335-
# VAD (VOICE ACTIVITY DETECTION)
1365+
# VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION
13361366
# ============================================================================
13371367

13381368
def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
@@ -1420,30 +1450,32 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
14201450
event_time = speaker_start_time if self._is_speaking else speaker_end_time
14211451

14221452
# Emit start of turn (not when using EXTERNAL)
1423-
if self._config.end_of_utterance_mode is not EndOfUtteranceMode.EXTERNAL:
1424-
# New turn started
1425-
if not self._end_of_turn_handler.handler_active and self._is_speaking:
1426-
self._end_of_turn_handler.start_handler()
1427-
self._turn_id = self._end_of_turn_handler.handler_id
1453+
if self._uses_turn_start_end:
1454+
"""Trigger a start of turn message and also reset pending end of turn prediction."""
1455+
1456+
# Emit end of turn prediction was wrong (turn continues)
1457+
if self._uses_eot_prediction and self._end_of_turn_handler.handler_active and self._is_speaking:
1458+
self._end_of_turn_handler.reset()
14281459
self._emit_message(
14291460
TurnStartEndResetMessage(
1430-
message=AgentServerMessageType.START_OF_TURN,
1461+
message=AgentServerMessageType.END_OF_TURN_RESET,
14311462
turn_id=self._turn_id,
14321463
metadata=MessageTimeMetadata(
1433-
start_time=event_time,
1464+
time=event_time,
14341465
),
14351466
),
14361467
)
14371468

1438-
# Emit end of turn prediction wrong
1439-
elif self._end_of_turn_handler.handler_active and self._is_speaking:
1440-
self._end_of_turn_handler.reset()
1469+
# New turn started
1470+
elif self._is_speaking and not self._end_of_turn_handler.handler_active:
1471+
self._end_of_turn_handler.start_handler()
1472+
self._turn_id = self._end_of_turn_handler.handler_id
14411473
self._emit_message(
14421474
TurnStartEndResetMessage(
1443-
message=AgentServerMessageType.END_OF_TURN_RESET,
1475+
message=AgentServerMessageType.START_OF_TURN,
14441476
turn_id=self._turn_id,
14451477
metadata=MessageTimeMetadata(
1446-
time=event_time,
1478+
start_time=event_time,
14471479
),
14481480
),
14491481
)
@@ -1462,51 +1494,57 @@ def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None:
14621494
),
14631495
)
14641496

1497+
# Speaking has started
1498+
if self._is_speaking:
1499+
self._handle_speaker_started()
1500+
14651501
# Speaking has stopped
1466-
if not self._is_speaking:
1467-
"""Reset the current speaker and do smart turn detection (if enabled)."""
1502+
else:
1503+
self._handle_speaker_stopped(speaker_end_time)
14681504

1469-
# Reset current speaker
1470-
self._current_speaker = None
1505+
def _handle_speaker_started(self) -> None:
1506+
"""Reset timers when a new speaker starts speaking after silence."""
14711507

1472-
# Add task for end of utterance
1473-
self._end_of_utterance_handler.update_timer(self._config.end_of_utterance_silence_trigger)
1508+
# Reset the handlers
1509+
self._end_of_utterance_handler.reset()
1510+
self._end_of_turn_handler.reset()
14741511

1475-
# For ADAPTIVE and SMART_TURN only
1476-
if self._config.end_of_utterance_mode is not EndOfUtteranceMode.EXTERNAL:
1477-
"""When not EXTERNAL, we need to do EOT detection / prediction."""
1512+
def _handle_speaker_stopped(self, speaker_end_time: float) -> None:
1513+
"""Reset the current speaker and do smart turn detection (if enabled)."""
14781514

1479-
# Callback
1480-
async def do_eot_detection(end_time: float, language: str) -> None:
1481-
try:
1482-
# Wait for Smart Turn result
1483-
if self._config.end_of_utterance_mode == EndOfUtteranceMode.SMART_TURN:
1484-
result = await self._predict_smart_turn(end_time, language)
1485-
else:
1486-
result = None
1515+
# Reset current speaker
1516+
self._current_speaker = None
14871517

1488-
# Create a new task to evaluate the finalize delay
1489-
delay = await self._calculate_finalize_delay(smart_turn_prediction=result)
1518+
# Add task for end of utterance
1519+
self._end_of_utterance_handler.update_timer(self._config.end_of_utterance_silence_trigger)
14901520

1491-
# Set the finalize timer (go now if no delay)
1492-
self._end_of_turn_handler.update_timer(delay or 0.01)
1521+
# For ADAPTIVE and SMART_TURN only
1522+
if self._uses_eot_prediction:
1523+
"""When not EXTERNAL, we need to do EOT detection / prediction."""
14931524

1494-
except asyncio.CancelledError:
1495-
pass
1525+
# Callback
1526+
async def do_eot_detection(end_time: float, language: str) -> None:
1527+
try:
1528+
# Wait for Smart Turn result
1529+
if self._eou_mode == EndOfUtteranceMode.SMART_TURN:
1530+
result = await self._predict_smart_turn(end_time, language)
1531+
else:
1532+
result = None
14961533

1497-
# Add task
1498-
self._end_of_turn_handler.add_task(
1499-
asyncio.create_task(do_eot_detection(speaker_end_time, self._config.language)),
1500-
self._config.end_of_utterance_mode.value,
1501-
)
1534+
# Create a new task to evaluate the finalize delay
1535+
delay = await self._calculate_finalize_delay(smart_turn_prediction=result)
15021536

1503-
# Speaking has started
1504-
else:
1505-
"""When speaking has started, reset speaking-related variables."""
1537+
# Set the finalize timer (go now if no delay)
1538+
self._end_of_turn_handler.update_timer(delay or 0.01)
1539+
1540+
except asyncio.CancelledError:
1541+
pass
15061542

1507-
# Reset the handlers
1508-
self._end_of_utterance_handler.reset()
1509-
self._end_of_turn_handler.reset()
1543+
# Add task
1544+
self._end_of_turn_handler.add_task(
1545+
asyncio.create_task(do_eot_detection(speaker_end_time, self._config.language)),
1546+
self._eou_mode.value,
1547+
)
15101548

15111549
# ============================================================================
15121550
# HELPER METHODS

tests/voice/test_04_models.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,14 +267,10 @@ async def test_presets():
267267
assert low_latency_one.max_delay == 12.34
268268
assert low_latency_one.enable_diarization is False
269269

270-
print(low_latency_one.model_dump_json())
271-
272270
# Overlay #2
273271
low_latency_two: VoiceAgentConfig | None = VoiceAgentConfigPreset.LOW_LATENCY(
274272
VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_mode=SpeechSegmentEmitMode.ON_SPEAKER_ENDED))
275273
)
276274
assert low_latency_two is not None
277275
assert low_latency_two.enable_diarization is True
278276
assert low_latency_two.speech_segment_config.emit_mode == SpeechSegmentEmitMode.ON_SPEAKER_ENDED
279-
280-
print(low_latency_two.model_dump_json())

tests/voice/test_05_utterance.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -310,9 +310,7 @@ async def test_end_of_utterance_adaptive_vad():
310310

311311
# Test conversation
312312
log = ConversationLog(os.path.join(os.path.dirname(__file__), "./assets/chat2.jsonl"))
313-
chat = log.get_conversation(
314-
["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript", "EndOfUtterance"]
315-
)
313+
chat = log.get_conversation(["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript"])
316314

317315
# Start time
318316
start_time = datetime.datetime.now()
@@ -393,7 +391,7 @@ async def send_message(idx: int, count: int = 1, use_ttl: bool = True):
393391
assert last_final_time is not None
394392

395393
# Timing info
396-
timeout = adaptive_timeout * 1.5
394+
timeout = adaptive_timeout * 2.0
397395

398396
# Wait for EndOfUtterance
399397
try:

0 commit comments

Comments
 (0)