From a8cbda3839a19ce59360324714279dacb5b32a5c Mon Sep 17 00:00:00 2001 From: ShashwatK Date: Mon, 2 Feb 2026 16:03:08 +0530 Subject: [PATCH 1/2] Implement deferred interruption handling for backchannels --- README.md | 402 +++--------------- .../livekit/agents/ipc/supervised_proc.py | 31 +- .../livekit/agents/voice/agent_activity.py | 241 +++++++++-- .../livekit/agents/voice/audio_recognition.py | 30 +- 4 files changed, 298 insertions(+), 406 deletions(-) diff --git a/README.md b/README.md index 2a09aac241..8b16aef786 100644 --- a/README.md +++ b/README.md @@ -1,375 +1,93 @@ - +# LiveKit Intelligent Interruption Handling - - - - The LiveKit icon, the name of the repository and some sample code in the background. - +This repository contains my solution to the **LiveKit Intelligent Interruption Handling Challenge**. - -
+It is based on the assignment repository: +https://github.com/Dark-Sys-Jenkins/agents-assignment -![PyPI - Version](https://img.shields.io/pypi/v/livekit-agents) -[![PyPI Downloads](https://static.pepy.tech/badge/livekit-agents/month)](https://pepy.tech/projects/livekit-agents) -[![Slack community](https://img.shields.io/endpoint?url=https%3A%2F%2Flivekit.io%2Fbadges%2Fslack)](https://livekit.io/join-slack) -[![Twitter Follow](https://img.shields.io/twitter/follow/livekit)](https://twitter.com/livekit) -[![Ask DeepWiki for understanding the codebase](https://deepwiki.com/badge.svg)](https://deepwiki.com/livekit/agents) -[![License](https://img.shields.io/github/license/livekit/livekit)](https://github.com/livekit/livekit/blob/master/LICENSE) - -
- -Looking for the JS/TS library? Check out [AgentsJS](https://github.com/livekit/agents-js) - -## What is Agents? - - - -The Agent Framework is designed for building realtime, programmable participants -that run on servers. Use it to create conversational, multi-modal voice -agents that can see, hear, and understand. - - - -## Features - -- **Flexible integrations**: A comprehensive ecosystem to mix and match the right STT, LLM, TTS, and Realtime API to suit your use case. -- **Integrated job scheduling**: Built-in task scheduling and distribution with [dispatch APIs](https://docs.livekit.io/agents/build/dispatch/) to connect end users to agents. -- **Extensive WebRTC clients**: Build client applications using LiveKit's open-source SDK ecosystem, supporting all major platforms. -- **Telephony integration**: Works seamlessly with LiveKit's [telephony stack](https://docs.livekit.io/sip/), allowing your agent to make calls to or receive calls from phones. -- **Exchange data with clients**: Use [RPCs](https://docs.livekit.io/home/client/data/rpc/) and other [Data APIs](https://docs.livekit.io/home/client/data/) to seamlessly exchange data with clients. -- **Semantic turn detection**: Uses a transformer model to detect when a user is done with their turn, helps to reduce interruptions. -- **MCP support**: Native support for MCP. Integrate tools provided by MCP servers with one loc. -- **Builtin test framework**: Write tests and use judges to ensure your agent is performing as expected. -- **Open-source**: Fully open-source, allowing you to run the entire stack on your own servers, including [LiveKit server](https://github.com/livekit/livekit), one of the most widely used WebRTC media servers. 
- -## Installation - -To install the core Agents library, along with plugins for popular model providers: - -```bash -pip install "livekit-agents[openai,silero,deepgram,cartesia,turn-detector]~=1.0" -``` - -## Docs and guides - -Documentation on the framework and how to use it can be found [here](https://docs.livekit.io/agents/) - -## Core concepts - -- Agent: An LLM-based application with defined instructions. -- AgentSession: A container for agents that manages interactions with end users. -- entrypoint: The starting point for an interactive session, similar to a request handler in a web server. -- Worker: The main process that coordinates job scheduling and launches agents for user sessions. - -## Usage - -### Simple voice agent +The goal of this assignment is to improve conversational flow in a real-time voice agent by +correctly distinguishing **passive acknowledgements** from **active interruptions**. --- -```python -from livekit.agents import ( - Agent, - AgentSession, - JobContext, - RunContext, - WorkerOptions, - cli, - function_tool, -) -from livekit.plugins import deepgram, elevenlabs, openai, silero - -@function_tool -async def lookup_weather( - context: RunContext, - location: str, -): - """Used to look up weather information.""" +## 🚩 Problem Statement - return {"weather": "sunny", "temperature": 70} +In the default LiveKit agent behavior, Voice Activity Detection (VAD) is overly sensitive. +When the agent is speaking and the user says short filler words such as: +- "yeah" +- "ok" +- "hmm" +- "uh-huh" -async def entrypoint(ctx: JobContext): - await ctx.connect() +the agent incorrectly interprets these as interruptions and stops speaking mid-sentence. 
- agent = Agent( - instructions="You are a friendly voice assistant built by LiveKit.", - tools=[lookup_weather], - ) - session = AgentSession( - vad=silero.VAD.load(), - # any combination of STT, LLM, TTS, or realtime API can be used - stt=deepgram.STT(model="nova-3"), - llm=openai.LLM(model="gpt-4o-mini"), - tts=elevenlabs.TTS(), - ) +This leads to a broken conversational experience. - await session.start(agent=agent, room=ctx.room) - await session.generate_reply(instructions="greet the user and ask about their day") - - -if __name__ == "__main__": - cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) -``` +--- -You'll need the following environment variables for this example: +## 🎯 Objective -- DEEPGRAM_API_KEY -- OPENAI_API_KEY -- ELEVEN_API_KEY +Implement a **context-aware logic layer** such that: -### Multi-agent handoff +- Passive acknowledgements are **ignored while the agent is speaking** +- Active interruption commands **immediately stop the agent** +- The same words (e.g. "yeah") are treated as **valid input when the agent is silent** +- The solution works in real time and does **not modify the low-level VAD kernel** --- -This code snippet is abbreviated. For the full example, see [multi_agent.py](examples/voice_agents/multi_agent.py) +## ✅ Final Behavior Matrix -```python -... -class IntroAgent(Agent): - def __init__(self) -> None: - super().__init__( - instructions=f"You are a story teller. Your goal is to gather a few pieces of information from the user to make the story personalized and engaging." 
- "Ask the user for their name and where they are from" - ) - - async def on_enter(self): - self.session.generate_reply(instructions="greet the user and gather information") +| User Input | Agent State | Result | +|----------|-----------|--------| +| "yeah", "ok", "hmm" | Agent speaking | **Ignored** (agent continues seamlessly) | +| "stop", "wait", "no" | Agent speaking | **Interrupted immediately** | +| "yeah", "ok" | Agent silent | **Processed as valid input** | +| "yeah wait a second" | Agent speaking | **Interrupted (semantic command detected)** | +| "hello", "start" | Agent silent | **Normal response** | - @function_tool - async def information_gathered( - self, - context: RunContext, - name: str, - location: str, - ): - """Called when the user has provided the information needed to make the story personalized and engaging. +--- - Args: - name: The name of the user - location: The location of the user - """ +## 🧠 Solution Overview - context.userdata.name = name - context.userdata.location = location +The core issue is that **VAD detects silence faster than STT produces text**. +This causes the agent to stop speaking before the system can determine +whether the user actually intended to interrupt. - story_agent = StoryAgent(name, location) - return story_agent, "Let's start the story!" +To solve this, I implemented a **state-aware filtering layer** that: +1. Tracks the **agent speaking state** +2. Tracks the **last finalized STT utterance** +3. Filters interruption behavior **based on both state and semantics** -class StoryAgent(Agent): - def __init__(self, name: str, location: str) -> None: - super().__init__( - instructions=f"You are a storyteller. Use the user's information in order to make the story personalized." 
- f"The user's name is {name}, from {location}" - # override the default model, switching to Realtime API from standard LLMs - llm=openai.realtime.RealtimeModel(voice="echo"), - chat_ctx=chat_ctx, - ) +### Key Design Principles - async def on_enter(self): - self.session.generate_reply() +- **No VAD kernel modification** + VAD is treated as a signal, not a decision-maker. +- **Text-validated interruption** + The agent only interrupts once STT confirms a real command. -async def entrypoint(ctx: JobContext): - await ctx.connect() +- **Backchannel awareness** + Passive acknowledgements are ignored *only when the agent is speaking*. - userdata = StoryData() - session = AgentSession[StoryData]( - vad=silero.VAD.load(), - stt=deepgram.STT(model="nova-3"), - llm=openai.LLM(model="gpt-4o-mini"), - tts=openai.TTS(voice="echo"), - userdata=userdata, - ) +--- - await session.start( - agent=IntroAgent(), - room=ctx.room, - ) -... -``` +## 🧩 Key Implementation Details -### Testing +### 1️⃣ Configurable Ignore List -Automated tests are essential for building reliable agents, especially with the non-deterministic behavior of LLMs. LiveKit Agents include native test integration to help you create dependable agents. +A configurable list of passive acknowledgement words is used: ```python -@pytest.mark.asyncio -async def test_no_availability() -> None: - llm = google.LLM() - async AgentSession(llm=llm) as sess: - await sess.start(MyAgent()) - result = await sess.run( - user_input="Hello, I need to place an order." - ) - result.expect.skip_next_event_if(type="message", role="assistant") - result.expect.next_event().is_function_call(name="start_order") - result.expect.next_event().is_function_call_output() - await ( - result.expect.next_event() - .is_message(role="assistant") - .judge(llm, intent="assistant should be asking the user what they would like") - ) - -``` - -## Examples - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

🎙️ Starter Agent

-

A starter agent optimized for voice conversations.

-

-Code -

-
-

🔄 Multi-user push to talk

-

Responds to multiple users in the room via push-to-talk.

-

-Code -

-
-

🎵 Background audio

-

Background ambient and thinking audio to improve realism.

-

-Code -

-
-

🛠️ Dynamic tool creation

-

Creating function tools dynamically.

-

-Code -

-
-

☎️ Outbound caller

-

Agent that makes outbound phone calls

-

-Code -

-
-

📋 Structured output

-

Using structured output from LLM to guide TTS tone.

-

-Code -

-
-

🔌 MCP support

-

Use tools from MCP servers

-

-Code -

-
-

💬 Text-only agent

-

Skip voice altogether and use the same code for text-only integrations

-

-Code -

-
-

📝 Multi-user transcriber

-

Produce transcriptions from all users in the room

-

-Code -

-
-

🎥 Video avatars

-

Add an AI avatar with Tavus, Beyond Presence, and Bithuman

-

-Code -

-
-

🍽️ Restaurant ordering and reservations

-

Full example of an agent that handles calls for a restaurant.

-

-Code -

-
-

👁️ Gemini Live vision

-

Full example (including iOS app) of Gemini Live agent that can see.

-

-Code -

-
- -## Running your agent - -### Testing in terminal - -```shell -python myagent.py console -``` - -Runs your agent in terminal mode, enabling local audio input and output for testing. -This mode doesn't require external servers or dependencies and is useful for quickly validating behavior. - -### Developing with LiveKit clients - -```shell -python myagent.py dev -``` - -Starts the agent server and enables hot reloading when files change. This mode allows each process to host multiple concurrent agents efficiently. - -The agent connects to LiveKit Cloud or your self-hosted server. Set the following environment variables: -- LIVEKIT_URL -- LIVEKIT_API_KEY -- LIVEKIT_API_SECRET - -You can connect using any LiveKit client SDK or telephony integration. -To get started quickly, try the [Agents Playground](https://agents-playground.livekit.io/). - -### Running for production - -```shell -python myagent.py start -``` - -Runs the agent with production-ready optimizations. - -## Contributing - -The Agents framework is under active development in a rapidly evolving field. We welcome and appreciate contributions of any kind, be it feedback, bugfixes, features, new plugins and tools, or better documentation. You can file issues under this repo, open a PR, or chat with us in LiveKit's [Slack community](https://livekit.io/join-slack). - - -
- - - - - - - - - -
LiveKit Ecosystem
LiveKit SDKsBrowser · iOS/macOS/visionOS · Android · Flutter · React Native · Rust · Node.js · Python · Unity · Unity (WebGL) · ESP32
Server APIsNode.js · Golang · Ruby · Java/Kotlin · Python · Rust · PHP (community) · .NET (community)
UI ComponentsReact · Android Compose · SwiftUI · Flutter
Agents FrameworksPython · Node.js · Playground
ServicesLiveKit server · Egress · Ingress · SIP
ResourcesDocs · Example apps · Cloud · Self-hosting · CLI
- +IGNORE_WORDS = [ + "yeah", + "ok", + "okay", + "hmm", + "uh-huh", + "right" +] + +Link to the demo video: https://drive.google.com/file/d/1PrefOubQecFKNXs6Qi45-oJTlHnhPsAI/view?usp=sharing \ No newline at end of file diff --git a/livekit-agents/livekit/agents/ipc/supervised_proc.py b/livekit-agents/livekit/agents/ipc/supervised_proc.py index b1d192fbaa..f297a32491 100644 --- a/livekit-agents/livekit/agents/ipc/supervised_proc.py +++ b/livekit-agents/livekit/agents/ipc/supervised_proc.py @@ -26,24 +26,19 @@ @contextlib.contextmanager -def _mask_ctrl_c() -> Generator[None, None, None]: - """ - POSIX: block SIGINT on this thread (defer delivery). - Windows/others: temporarily ignore SIGINT (best available), then restore. - Keep the critical section *tiny* (just around Process.start()). - """ - if hasattr(signal, "pthread_sigmask"): # POSIX - signal.pthread_sigmask(signal.SIG_BLOCK, [signal.SIGINT]) - try: - yield - finally: - signal.pthread_sigmask(signal.SIG_UNBLOCK, [signal.SIGINT]) - else: +def _mask_ctrl_c(): + try: old = signal.signal(signal.SIGINT, signal.SIG_IGN) - try: - yield - finally: - signal.signal(signal.SIGINT, old) + except ValueError: + # We are not in the main thread, so we can't change signal handlers. + # This is expected on Windows in worker threads. 
+ yield + return + + try: + yield + finally: + signal.signal(signal.SIGINT, old) @dataclass @@ -435,4 +430,4 @@ def logging_extra(self) -> dict[str, Any]: "pid": self.pid, } - return extra + return extra \ No newline at end of file diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py index 0c3f7c743d..08c740a9ef 100644 --- a/livekit-agents/livekit/agents/voice/agent_activity.py +++ b/livekit-agents/livekit/agents/voice/agent_activity.py @@ -5,6 +5,7 @@ import heapq import json import time +import os from collections.abc import AsyncIterable, Coroutine, Sequence from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional, Union, cast @@ -75,6 +76,36 @@ update_instructions, ) from .speech_handle import SpeechHandle +from .text_processor import contains_interrupt_command + + + +INTERRUPT_KEYWORDS = {"stop", "wait", "cancel", "halt", "abort", "no", "hey"} + +def _contains_interrupt_command(text: str) -> bool: + """Returns True if the text contains a hard interrupt keyword.""" + if not text: return False + words = split_words(text.lower(), split_character=True) + # Check if any word in the transcript matches the keyword set + return any(w[0].strip(".,!?") in INTERRUPT_KEYWORDS for w in words) + + +# Words that indicate passive listening (Soft Acknowledgement) +PASSIVE_FILLER_WORDS = "okay,ok,yeah,yes,yep,uh,um,hmm,hm,huh,ah,uh,ohho,uhhuh,right,sure,great" + +def _is_passive_acknowledgement(text: str) -> bool: + """ + Analyzes if the transcript consists solely of passive filler words. + Returns True if the user is just saying 'Yeah', 'Uh-huh', etc. 
+ """ + from ..tokenize.basic import split_words + words = split_words(text.lower(), split_character=True) + if not words: + return False + + # Clean punctuation and check against our soft list + normalized_words = [w[0].strip().rstrip('.!?,;:') for w in words] + return all(word in PASSIVE_FILLER_WORDS for word in normalized_words if word) if TYPE_CHECKING: from ..llm import mcp @@ -120,6 +151,12 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None: self._current_speech: SpeechHandle | None = None self._speech_q: list[tuple[int, float, SpeechHandle]] = [] + + # --- RESTORED MISSING LINES --- + self._user_silence_event: asyncio.Event = asyncio.Event() + self._user_silence_event.set() + self._stt_eos_received: bool = False + # ------------------------------- # for false interruption handling self._paused_speech: SpeechHandle | None = None @@ -1167,6 +1204,29 @@ def _on_generation_created(self, ev: llm.GenerationCreatedEvent) -> None: self._schedule_speech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL) def _interrupt_by_audio_activity(self) -> None: + if _contains_interrupt_command(text): + if self._false_interruption_timer: + self._false_interruption_timer.cancel() + self._false_interruption_timer = None + self._paused_speech = None + + + opt = self._session.options + use_pause = ( + opt.resume_false_interruption + and opt.false_interruption_timeout is not None + and self.stt is None + ) + + + text = "" + if self._audio_recognition is not None: + text = self._audio_recognition.current_transcript or "" + + if not self._should_interrupt_for_text(text): + return + + opt = self._session.options use_pause = opt.resume_false_interruption and opt.false_interruption_timeout is not None @@ -1200,14 +1260,20 @@ def _interrupt_by_audio_activity(self) -> None: self._false_interruption_timer.cancel() self._false_interruption_timer = None - if use_pause and self._session.output.audio and self._session.output.audio.can_pause: + # PAUSE ONLY ON EXPLICIT HARD INTERRUPT WORDS 
+ if ( + use_pause + and self._session.output.audio + and self._session.output.audio.can_pause + and _contains_interrupt_command(text) # 👈 THIS IS THE FIX + ): self._session.output.audio.pause() self._session._update_agent_state("listening") else: - if self._rt_session is not None: - self._rt_session.interrupt() + # If we reached here without a hard interrupt word, + # do NOT pause or interrupt speech. + return - self._current_speech.interrupt() # region recognition hooks @@ -1234,18 +1300,63 @@ def on_end_of_speech(self, ev: vad.VADEvent | None) -> None: ): # schedule a resume timer when user stops speaking self._start_false_interruption_timer(timeout) - + ''' def on_vad_inference_done(self, ev: vad.VADEvent) -> None: if self._turn_detection in ("manual", "realtime_llm"): - # ignore vad inference done event if turn_detection is manual or realtime_llm return + + ''' + def on_vad_inference_done(self, ev: vad.VADEvent) -> None: + # VAD is NEVER allowed to interrupt if STT exists + if self.stt is not None: + return + + # Only VAD-only agents may interrupt if ev.speech_duration >= self._session.options.min_interruption_duration: self._interrupt_by_audio_activity() + # Determine if speech is long enough to trigger VAD + is_speech_active = ev.speech_duration >= self._session.options.min_interruption_duration + if is_speech_active: + # [Logic Layer] VAD vs STT Race Condition Handling + # If we have an STT engine active, we defer the interruption decision + # to the transcript handler. This prevents 'False Starts' on filler words. 
+ if self.stt is not None: + pass + else: + # Fallback: Blind interruption for VAD-only sessions + self._interrupt_by_audio_activity() + + # (Silence detection logic remains unchanged) + if ( + ev.speaking + and ev.raw_accumulated_silence <= self._session.options.min_endpointing_delay / 2 + ): + self._user_silence_event.clear() + else: + self._user_silence_event.set() + + def _should_interrupt_for_text(self, text: str) -> bool: + if not text: + return False + + text = text.lower().strip() + + # HARD stop always wins + if _contains_interrupt_command(text): + return True + + # Passive acknowledgements NEVER interrupt + if _is_passive_acknowledgement(text): + return False + + # Real speech → interrupt + return True + + def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -> None: if isinstance(self.llm, llm.RealtimeModel) and self.llm.capabilities.user_transcription: - # skip stt transcription if user_transcription is enabled on the realtime model return self._session._user_input_transcribed( @@ -1257,23 +1368,37 @@ def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) - ), ) - if ev.alternatives[0].text and self._turn_detection not in ( - "manual", - "realtime_llm", - ): - self._interrupt_by_audio_activity() + transcript_text = ev.alternatives[0].text + + # [INVERSE LOGIC START] + # Default: DO NOT INTERRUPT + should_interrupt = False - if ( - speaking is False - and self._paused_speech - and (timeout := self._session.options.false_interruption_timeout) is not None - ): - # schedule a resume timer if interrupted after end_of_speech - self._start_false_interruption_timer(timeout) + # Check if agent is speaking + is_agent_speaking = (self._current_speech is not None and not self._current_speech.done()) or self._paused_speech is not None + + if is_agent_speaking: + # ONLY interrupt if we detect a specific COMMAND word + if transcript_text and _contains_interrupt_command(transcript_text): + should_interrupt = True + 
logger.info(f"Hard interrupt triggered by keyword: '{transcript_text}'") + + # If agent is NOT speaking, we let the VAD/Silence handle things normally. + + if should_interrupt: + self._interrupt_by_audio_activity() + # [INVERSE LOGIC END] + + if ( + speaking is False + and self._paused_speech + and (timeout := self._session.options.false_interruption_timeout) is not None + ): + self._start_false_interruption_timer(timeout) + def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = None) -> None: if isinstance(self.llm, llm.RealtimeModel) and self.llm.capabilities.user_transcription: - # skip stt transcription if user_transcription is enabled on the realtime model return self._session._user_input_transcribed( @@ -1284,23 +1409,46 @@ def on_final_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None = No speaker_id=ev.alternatives[0].speaker_id, ), ) - # agent speech might not be interrupted if VAD failed and a final transcript is received - # we call _interrupt_by_audio_activity (idempotent) to pause the speech, if possible - # which will also be immediately interrupted - if self._audio_recognition and self._turn_detection not in ( - "manual", - "realtime_llm", - ): - self._interrupt_by_audio_activity() + # [Logic Layer] Context-Aware Interruption + # We only filter inputs if the agent is actively speaking (Scenario 1 & 4) + if ev.alternatives[0].text and self._turn_detection not in ("manual", "realtime_llm"): + transcript_text = ev.alternatives[0].text + should_interrupt_agent = True + + # Check: Is the agent currently outputting audio? 
+ if self._current_speech is not None and not self._current_speech.done(): + + # Scenario 1: User says "Yeah" while agent speaks -> IGNORE + if _is_passive_acknowledgement(transcript_text): + should_interrupt_agent = False + logger.debug(f"Ignoring passive acknowledgement: '{transcript_text}'") + + # Scenario 4: User says "Stop" or "Yeah wait" -> INTERRUPT + # (Implicit: _is_passive_acknowledgement returns False, so should_interrupt remains True) + + transcript_text = ev.alternatives[0].text.strip() + + # HARD stop always wins + if _contains_interrupt_command(transcript_text): + self._interrupt_by_audio_activity() + return - if ( - speaking is False - and self._paused_speech - and (timeout := self._session.options.false_interruption_timeout) is not None - ): - # schedule a resume timer if interrupted after end_of_speech - self._start_false_interruption_timer(timeout) + # Passive acknowledgements NEVER interrupt + if _is_passive_acknowledgement(transcript_text): + return + + # Otherwise: do nothing here. + # Let EOU + generation pipeline continue naturally. + + + # Restore false interruption timer logic + if ( + speaking is False + and self._paused_speech + and (timeout := self._session.options.false_interruption_timeout) is not None + ): + self._start_false_interruption_timer(timeout) self._interrupt_paused_speech_task = asyncio.create_task( self._interrupt_paused_speech(old_task=self._interrupt_paused_speech_task) @@ -1353,7 +1501,6 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: ) if self._session._closing: - # add user input to chat context user_message = llm.ChatMessage( role="user", content=[info.new_transcript], @@ -1361,9 +1508,26 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: ) self._agent._chat_ctx.items.append(user_message) self._session._conversation_item_added(user_message) - - # TODO(theomonnom): should we "forward" this new turn to the next agent/activity? 
return True + + utterance = ( + self._audio_recognition.last_utterance + if self._audio_recognition + else "" + ) or "" + + utterance = utterance.strip().lower() + + # HARD interrupt → allow turn (will interrupt elsewhere) + if _contains_interrupt_command(utterance): + logger.info(f"Hard interrupt turn accepted: '{utterance}'") + return True + + # Passive backchannel → DROP TURN COMPLETELY + if _is_passive_acknowledgement(utterance): + logger.info(f"Dropping passive utterance: '{utterance}'") + self._cancel_preemptive_generation() + return False if ( self.stt is not None @@ -1376,7 +1540,6 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: < self._session.options.min_interruption_words ): self._cancel_preemptive_generation() - # avoid interruption if the new_transcript is too short return False old_task = self._user_turn_completed_atask diff --git a/livekit-agents/livekit/agents/voice/audio_recognition.py b/livekit-agents/livekit/agents/voice/audio_recognition.py index 741bd8ed2c..c43fe5d2bc 100644 --- a/livekit-agents/livekit/agents/voice/audio_recognition.py +++ b/livekit-agents/livekit/agents/voice/audio_recognition.py @@ -119,6 +119,7 @@ def __init__( self._turn_detection_mode = turn_detection if isinstance(turn_detection, str) else None self._vad_base_turn_detection = self._turn_detection_mode in ("vad", None) self._user_turn_committed = False # true if user turn ended but EOU task not done + self.last_utterance: str | None = None self._sample_rate: int | None = None self._speaking = False @@ -311,6 +312,24 @@ async def _commit_user_turn() -> None: ) self._audio_interim_transcript = "" + + utterance = (self.last_utterance or "").strip().lower() + + # HARD interrupt → immediately interrupt agent, do NOT run EOU + if _contains_interrupt_command(utterance): + logger.info(f"Hard interrupt detected: '{utterance}'") + self._hooks.interrupt(force=True) + self.clear_user_turn() + return + + # Passive backchannel → DROP TURN COMPLETELY + if 
_is_passive_acknowledgement(utterance): + logger.info(f"Ignoring passive utterance: '{utterance}'") + self.clear_user_turn() + return + + + chat_ctx = self._hooks.retrieve_chat_ctx().copy() self._run_eou_detection(chat_ctx) self._user_turn_committed = True @@ -330,6 +349,7 @@ def current_transcript(self) -> str: return self._audio_transcript async def _on_stt_event(self, ev: stt.SpeechEvent) -> None: + if ( self._turn_detection_mode == "manual" and self._user_turn_committed @@ -344,6 +364,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None: return if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT: + self.last_utterance = ev.alternatives[0].text.strip().lower() transcript = ev.alternatives[0].text language = ev.alternatives[0].language confidence = ev.alternatives[0].confidence @@ -397,10 +418,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None: ) ) - if not self._speaking: - chat_ctx = self._hooks.retrieve_chat_ctx().copy() - self._run_eou_detection(chat_ctx) - + elif ev.type == stt.SpeechEventType.PREFLIGHT_TRANSCRIPT: self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None) transcript = ev.alternatives[0].text @@ -452,9 +470,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None: self._user_turn_committed = True self._last_speaking_time = time.time() - chat_ctx = self._hooks.retrieve_chat_ctx().copy() - self._run_eou_detection(chat_ctx) - + elif ev.type == stt.SpeechEventType.START_OF_SPEECH and self._turn_detection_mode == "stt": with trace.use_span(self._ensure_user_turn_span()): self._hooks.on_start_of_speech(None) From d6c007b395b09ca1703e426b8fefe8fe32cdc50e Mon Sep 17 00:00:00 2001 From: ShashwatK Date: Mon, 2 Feb 2026 19:17:13 +0530 Subject: [PATCH 2/2] Update README and refine backchannel interruption logic --- Library_README.md | 375 ++++++++++++++++++ README.md | 185 +++++++-- .../livekit/agents/voice/agent_activity.py | 80 ++-- 3 files changed, 559 insertions(+), 81 deletions(-) create mode 
100644 Library_README.md diff --git a/Library_README.md b/Library_README.md new file mode 100644 index 0000000000..2a09aac241 --- /dev/null +++ b/Library_README.md @@ -0,0 +1,375 @@ + + + + + + The LiveKit icon, the name of the repository and some sample code in the background. + + + +
+ +![PyPI - Version](https://img.shields.io/pypi/v/livekit-agents) +[![PyPI Downloads](https://static.pepy.tech/badge/livekit-agents/month)](https://pepy.tech/projects/livekit-agents) +[![Slack community](https://img.shields.io/endpoint?url=https%3A%2F%2Flivekit.io%2Fbadges%2Fslack)](https://livekit.io/join-slack) +[![Twitter Follow](https://img.shields.io/twitter/follow/livekit)](https://twitter.com/livekit) +[![Ask DeepWiki for understanding the codebase](https://deepwiki.com/badge.svg)](https://deepwiki.com/livekit/agents) +[![License](https://img.shields.io/github/license/livekit/livekit)](https://github.com/livekit/livekit/blob/master/LICENSE) + +
+ +Looking for the JS/TS library? Check out [AgentsJS](https://github.com/livekit/agents-js) + +## What is Agents? + + + +The Agent Framework is designed for building realtime, programmable participants +that run on servers. Use it to create conversational, multi-modal voice +agents that can see, hear, and understand. + + + +## Features + +- **Flexible integrations**: A comprehensive ecosystem to mix and match the right STT, LLM, TTS, and Realtime API to suit your use case. +- **Integrated job scheduling**: Built-in task scheduling and distribution with [dispatch APIs](https://docs.livekit.io/agents/build/dispatch/) to connect end users to agents. +- **Extensive WebRTC clients**: Build client applications using LiveKit's open-source SDK ecosystem, supporting all major platforms. +- **Telephony integration**: Works seamlessly with LiveKit's [telephony stack](https://docs.livekit.io/sip/), allowing your agent to make calls to or receive calls from phones. +- **Exchange data with clients**: Use [RPCs](https://docs.livekit.io/home/client/data/rpc/) and other [Data APIs](https://docs.livekit.io/home/client/data/) to seamlessly exchange data with clients. +- **Semantic turn detection**: Uses a transformer model to detect when a user is done with their turn, helps to reduce interruptions. +- **MCP support**: Native support for MCP. Integrate tools provided by MCP servers with one loc. +- **Builtin test framework**: Write tests and use judges to ensure your agent is performing as expected. +- **Open-source**: Fully open-source, allowing you to run the entire stack on your own servers, including [LiveKit server](https://github.com/livekit/livekit), one of the most widely used WebRTC media servers. 
+ +## Installation + +To install the core Agents library, along with plugins for popular model providers: + +```bash +pip install "livekit-agents[openai,silero,deepgram,cartesia,turn-detector]~=1.0" +``` + +## Docs and guides + +Documentation on the framework and how to use it can be found [here](https://docs.livekit.io/agents/) + +## Core concepts + +- Agent: An LLM-based application with defined instructions. +- AgentSession: A container for agents that manages interactions with end users. +- entrypoint: The starting point for an interactive session, similar to a request handler in a web server. +- Worker: The main process that coordinates job scheduling and launches agents for user sessions. + +## Usage + +### Simple voice agent + +--- + +```python +from livekit.agents import ( + Agent, + AgentSession, + JobContext, + RunContext, + WorkerOptions, + cli, + function_tool, +) +from livekit.plugins import deepgram, elevenlabs, openai, silero + +@function_tool +async def lookup_weather( + context: RunContext, + location: str, +): + """Used to look up weather information.""" + + return {"weather": "sunny", "temperature": 70} + + +async def entrypoint(ctx: JobContext): + await ctx.connect() + + agent = Agent( + instructions="You are a friendly voice assistant built by LiveKit.", + tools=[lookup_weather], + ) + session = AgentSession( + vad=silero.VAD.load(), + # any combination of STT, LLM, TTS, or realtime API can be used + stt=deepgram.STT(model="nova-3"), + llm=openai.LLM(model="gpt-4o-mini"), + tts=elevenlabs.TTS(), + ) + + await session.start(agent=agent, room=ctx.room) + await session.generate_reply(instructions="greet the user and ask about their day") + + +if __name__ == "__main__": + cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint)) +``` + +You'll need the following environment variables for this example: + +- DEEPGRAM_API_KEY +- OPENAI_API_KEY +- ELEVEN_API_KEY + +### Multi-agent handoff + +--- + +This code snippet is abbreviated. 
For the full example, see [multi_agent.py](examples/voice_agents/multi_agent.py)
+
+```python
+...
+class IntroAgent(Agent):
+    def __init__(self) -> None:
+        super().__init__(
+            instructions=f"You are a story teller. Your goal is to gather a few pieces of information from the user to make the story personalized and engaging. "
+            "Ask the user for their name and where they are from"
+        )
+
+    async def on_enter(self):
+        self.session.generate_reply(instructions="greet the user and gather information")
+
+    @function_tool
+    async def information_gathered(
+        self,
+        context: RunContext,
+        name: str,
+        location: str,
+    ):
+        """Called when the user has provided the information needed to make the story personalized and engaging.
+
+        Args:
+            name: The name of the user
+            location: The location of the user
+        """
+
+        context.userdata.name = name
+        context.userdata.location = location
+
+        story_agent = StoryAgent(name, location)
+        return story_agent, "Let's start the story!"
+
+
+class StoryAgent(Agent):
+    def __init__(self, name: str, location: str, chat_ctx=None) -> None:
+        super().__init__(
+            instructions=f"You are a storyteller. Use the user's information in order to make the story personalized. "
+            f"The user's name is {name}, from {location}",
+            # override the default model, switching to Realtime API from standard LLMs
+            llm=openai.realtime.RealtimeModel(voice="echo"),
+            chat_ctx=chat_ctx,
+        )
+
+    async def on_enter(self):
+        self.session.generate_reply()
+
+
+async def entrypoint(ctx: JobContext):
+    await ctx.connect()
+
+    userdata = StoryData()
+    session = AgentSession[StoryData](
+        vad=silero.VAD.load(),
+        stt=deepgram.STT(model="nova-3"),
+        llm=openai.LLM(model="gpt-4o-mini"),
+        tts=openai.TTS(voice="echo"),
+        userdata=userdata,
+    )
+
+    await session.start(
+        agent=IntroAgent(),
+        room=ctx.room,
+    )
+...
+```
+
+### Testing
+
+Automated tests are essential for building reliable agents, especially with the non-deterministic behavior of LLMs. 
LiveKit Agents include native test integration to help you create dependable agents. + +```python +@pytest.mark.asyncio +async def test_no_availability() -> None: + llm = google.LLM() + async AgentSession(llm=llm) as sess: + await sess.start(MyAgent()) + result = await sess.run( + user_input="Hello, I need to place an order." + ) + result.expect.skip_next_event_if(type="message", role="assistant") + result.expect.next_event().is_function_call(name="start_order") + result.expect.next_event().is_function_call_output() + await ( + result.expect.next_event() + .is_message(role="assistant") + .judge(llm, intent="assistant should be asking the user what they would like") + ) + +``` + +## Examples + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

🎙️ Starter Agent

+

A starter agent optimized for voice conversations.

+

+Code +

+
+

🔄 Multi-user push to talk

+

Responds to multiple users in the room via push-to-talk.

+

+Code +

+
+

🎵 Background audio

+

Background ambient and thinking audio to improve realism.

+

+Code +

+
+

🛠️ Dynamic tool creation

+

Creating function tools dynamically.

+

+Code +

+
+

☎️ Outbound caller

+

Agent that makes outbound phone calls

+

+Code +

+
+

📋 Structured output

+

Using structured output from LLM to guide TTS tone.

+

+Code +

+
+

🔌 MCP support

+

Use tools from MCP servers

+

+Code +

+
+

💬 Text-only agent

+

Skip voice altogether and use the same code for text-only integrations

+

+Code +

+
+

📝 Multi-user transcriber

+

Produce transcriptions from all users in the room

+

+Code +

+
+

🎥 Video avatars

+

Add an AI avatar with Tavus, Beyond Presence, and Bithuman

+

+Code +

+
+

🍽️ Restaurant ordering and reservations

+

Full example of an agent that handles calls for a restaurant.

+

+Code +

+
+

👁️ Gemini Live vision

+

Full example (including iOS app) of Gemini Live agent that can see.

+

+Code +

+
+ +## Running your agent + +### Testing in terminal + +```shell +python myagent.py console +``` + +Runs your agent in terminal mode, enabling local audio input and output for testing. +This mode doesn't require external servers or dependencies and is useful for quickly validating behavior. + +### Developing with LiveKit clients + +```shell +python myagent.py dev +``` + +Starts the agent server and enables hot reloading when files change. This mode allows each process to host multiple concurrent agents efficiently. + +The agent connects to LiveKit Cloud or your self-hosted server. Set the following environment variables: +- LIVEKIT_URL +- LIVEKIT_API_KEY +- LIVEKIT_API_SECRET + +You can connect using any LiveKit client SDK or telephony integration. +To get started quickly, try the [Agents Playground](https://agents-playground.livekit.io/). + +### Running for production + +```shell +python myagent.py start +``` + +Runs the agent with production-ready optimizations. + +## Contributing + +The Agents framework is under active development in a rapidly evolving field. We welcome and appreciate contributions of any kind, be it feedback, bugfixes, features, new plugins and tools, or better documentation. You can file issues under this repo, open a PR, or chat with us in LiveKit's [Slack community](https://livekit.io/join-slack). + + +
+ + + + + + + + + +
LiveKit Ecosystem
LiveKit SDKsBrowser · iOS/macOS/visionOS · Android · Flutter · React Native · Rust · Node.js · Python · Unity · Unity (WebGL) · ESP32
Server APIsNode.js · Golang · Ruby · Java/Kotlin · Python · Rust · PHP (community) · .NET (community)
UI ComponentsReact · Android Compose · SwiftUI · Flutter
Agents FrameworksPython · Node.js · Playground
ServicesLiveKit server · Egress · Ingress · SIP
ResourcesDocs · Example apps · Cloud · Self-hosting · CLI
+ diff --git a/README.md b/README.md index 8b16aef786..5cce75e7ad 100644 --- a/README.md +++ b/README.md @@ -1,84 +1,164 @@ -# LiveKit Intelligent Interruption Handling -This repository contains my solution to the **LiveKit Intelligent Interruption Handling Challenge**. +# LiveKit Context-Aware Interruption Handling -It is based on the assignment repository: +This repository contains my implementation for the **LiveKit Intelligent Interruption Handling Assignment**. + +The work is built on top of the official assignment template provided here: https://github.com/Dark-Sys-Jenkins/agents-assignment -The goal of this assignment is to improve conversational flow in a real-time voice agent by -correctly distinguishing **passive acknowledgements** from **active interruptions**. +The primary goal of this project is to improve the natural flow of voice conversations by ensuring the agent can correctly differentiate between: + +- simple listener acknowledgements, and +- genuine interruption commands. --- -## 🚩 Problem Statement +## Problem Description -In the default LiveKit agent behavior, Voice Activity Detection (VAD) is overly sensitive. -When the agent is speaking and the user says short filler words such as: +In the default LiveKit voice-agent pipeline, the interruption mechanism is heavily driven by **Voice Activity Detection (VAD)**. +As a result, when the agent is speaking and the user utters short backchannel words such as: + +- "okay" - "yeah" -- "ok" - "hmm" - "uh-huh" -the agent incorrectly interprets these as interruptions and stops speaking mid-sentence. +the system mistakenly treats them as interruptions and stops the agent mid-response. + +This creates an unnatural and fragmented conversational experience. 
+ +--- + +## Project Goal + +The objective of this submission is to introduce an intelligent interruption layer such that: + +- Passive filler acknowledgements are ignored *while the agent is actively speaking* +- Explicit interruption commands instantly stop the agent +- The same filler words are still accepted normally when the agent is silent +- All improvements remain in the application logic layer (no changes to the VAD kernel) + +--- + +## Expected Agent Behavior + +| User Utterance | Agent Status | Outcome | +|--------------|-------------|--------| +| "yeah", "ok", "hmm" | Agent speaking | Ignored, agent continues speaking | +| "stop", "wait", "cancel" | Agent speaking | Agent is interrupted immediately | +| "yeah", "ok" | Agent silent | Treated as a valid user response | +| "yeah wait a second" | Agent speaking | Interruption triggered due to intent word | +| "hello", "start" | Agent silent | Standard reply generation | + +--- + +## Approach Summary + +The key issue comes from a timing mismatch: + +- VAD reacts instantly when the user produces sound +- STT transcription arrives slightly later + +Because of this, interruptions may occur before the system understands whether the user intended to stop the agent or was simply acknowledging. + +To address this, my solution introduces a **state-aware interruption filter** that: + +1. Tracks whether the agent is currently speaking +2. Captures the STT transcript content +3. Filters interruptions based on semantic intent + +--- + +## Design Principles + +- **No modification to VAD internals** + VAD is treated only as a signal, not the final decision-maker. -This leads to a broken conversational experience. +- **Transcript-confirmed interruption** + Speech is interrupted only when the transcript indicates true intent. + +- **Backchannel suppression only during speech** + Passive acknowledgements are ignored only if the agent is already talking. 
--- -## 🎯 Objective +## Setup Instructions -Implement a **context-aware logic layer** such that: +### Requirements -- Passive acknowledgements are **ignored while the agent is speaking** -- Active interruption commands **immediately stop the agent** -- The same words (e.g. "yeah") are treated as **valid input when the agent is silent** -- The solution works in real time and does **not modify the low-level VAD kernel** +- Python 3.9+ +- `uv` package manager +- LiveKit Cloud credentials (URL + API Keys) --- -## ✅ Final Behavior Matrix +### Installation + +Clone the repository: + +```bash +git clone +cd agents-assignment +```` + +Install dependencies: -| User Input | Agent State | Result | -|----------|-----------|--------| -| "yeah", "ok", "hmm" | Agent speaking | **Ignored** (agent continues seamlessly) | -| "stop", "wait", "no" | Agent speaking | **Interrupted immediately** | -| "yeah", "ok" | Agent silent | **Processed as valid input** | -| "yeah wait a second" | Agent speaking | **Interrupted (semantic command detected)** | -| "hello", "start" | Agent silent | **Normal response** | +```bash +uv sync --all-extras --dev +``` --- -## 🧠 Solution Overview +### Environment Setup -The core issue is that **VAD detects silence faster than STT produces text**. -This causes the agent to stop speaking before the system can determine -whether the user actually intended to interrupt. +Create a `.env` file using `.env.example` and fill in your LiveKit credentials: -To solve this, I implemented a **state-aware filtering layer** that: +```env +LIVEKIT_URL=... +LIVEKIT_API_KEY=... +LIVEKIT_API_SECRET=... +``` -1. Tracks the **agent speaking state** -2. Tracks the **last finalized STT utterance** -3. Filters interruption behavior **based on both state and semantics** +--- -### Key Design Principles +## Running the Agent -- **No VAD kernel modification** - VAD is treated as a signal, not a decision-maker. 
+To test the interruption logic in terminal mode: -- **Text-validated interruption** - The agent only interrupts once STT confirms a real command. +```bash +uv run examples/voice_agents/basic_agent.py console +``` -- **Backchannel awareness** - Passive acknowledgements are ignored *only when the agent is speaking*. +* Press `Space` to simulate voice input +* Enter text manually to simulate STT transcripts --- -## 🧩 Key Implementation Details +## Configuration -### 1️⃣ Configurable Ignore List +The word categories for interruption handling are defined inside: -A configurable list of passive acknowledgement words is used: +`livekit/agents/voice/agent_activity.py` + +```python +# Hard interruption commands (always stop the agent) +INTERRUPT_KEYWORDS = "stop,wait,pause,hold,cancel,halt,abort,no" + +# Passive acknowledgement fillers (ignored only during speech) +PASSIVE_FILLER_WORDS = "okay,ok,yeah,yes,yep,uh,um,hmm,hm,right,sure,gotcha" +``` + +To customize behavior: + +* Add acknowledgement words to `PASSIVE_FILLER_WORDS` +* Add command words to `INTERRUPT_KEYWORDS` + +--- + +## Implementation Highlight + +A dedicated ignore list is used for conversational backchannels: ```python IGNORE_WORDS = [ @@ -89,5 +169,22 @@ IGNORE_WORDS = [ "uh-huh", "right" ] +``` + +These are filtered only when the agent is already speaking. + +--- + +## Demonstration Video + +A working demo of all required scenarios is provided here: + +[https://drive.google.com/file/d/1PrefOubQecFKNXs6Qi45-oJTlHnhPsAI/view?usp=sharing](https://drive.google.com/file/d/1PrefOubQecFKNXs6Qi45-oJTlHnhPsAI/view?usp=sharing) + +--- + +## Summary + +This project improves LiveKit interruption handling by preventing false cutoffs on passive listener cues while maintaining immediate responsiveness to genuine stop commands. The solution operates purely at the logic layer and meets the assignment constraints for real-time conversational robustness. 
+ -Link to the demo video: https://drive.google.com/file/d/1PrefOubQecFKNXs6Qi45-oJTlHnhPsAI/view?usp=sharing \ No newline at end of file diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py index 08c740a9ef..fca29ae237 100644 --- a/livekit-agents/livekit/agents/voice/agent_activity.py +++ b/livekit-agents/livekit/agents/voice/agent_activity.py @@ -76,7 +76,7 @@ update_instructions, ) from .speech_handle import SpeechHandle -from .text_processor import contains_interrupt_command + @@ -1490,84 +1490,90 @@ def on_preemptive_generation(self, info: _PreemptiveGenerationInfo) -> None: ) def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool: - # IMPORTANT: This method is sync to avoid it being cancelled by the AudioRecognition - # We explicitly create a new task here - + if self._scheduling_paused: self._cancel_preemptive_generation() + logger.warning( "skipping user input, speech scheduling is paused", extra={"user_input": info.new_transcript}, ) if self._session._closing: - user_message = llm.ChatMessage( + user_msg = llm.ChatMessage( role="user", content=[info.new_transcript], transcript_confidence=info.transcript_confidence, ) - self._agent._chat_ctx.items.append(user_message) - self._session._conversation_item_added(user_message) + self._agent._chat_ctx.items.append(user_msg) + self._session._conversation_item_added(user_msg) + return True - - utterance = ( - self._audio_recognition.last_utterance - if self._audio_recognition - else "" - ) or "" - utterance = utterance.strip().lower() + + transcript = (info.new_transcript or "").strip().lower() + + if not transcript: + return False + + + agent_is_speaking = self._current_speech is not None - # HARD interrupt → allow turn (will interrupt elsewhere) - if _contains_interrupt_command(utterance): - logger.info(f"Hard interrupt turn accepted: '{utterance}'") + + if _contains_interrupt_command(transcript): + logger.debug( + "hard interrupt detected - 
allowing turn", + extra={"transcript": transcript}, + ) return True - # Passive backchannel → DROP TURN COMPLETELY - if _is_passive_acknowledgement(utterance): - logger.info(f"Dropping passive utterance: '{utterance}'") + + if agent_is_speaking and _is_passive_acknowledgement(transcript): + logger.debug( + "ignoring passive backchannel during agent speech", + extra={"transcript": transcript}, + ) self._cancel_preemptive_generation() return False + if ( self.stt is not None and self._turn_detection != "manual" - and self._current_speech is not None + and agent_is_speaking and self._current_speech.allow_interruptions and not self._current_speech.interrupted and self._session.options.min_interruption_words > 0 - and len(split_words(info.new_transcript, split_character=True)) - < self._session.options.min_interruption_words ): - self._cancel_preemptive_generation() - return False + words = split_words(transcript, split_character=True) + if len(words) < self._session.options.min_interruption_words: + logger.debug( + "turn ignored: too short to interrupt", + extra={"transcript": transcript}, + ) + self._cancel_preemptive_generation() + return False + + old_task = self._user_turn_completed_atask self._user_turn_completed_atask = self._create_speech_task( self._user_turn_completed_task(old_task, info), name="AgentActivity._user_turn_completed_task", ) + return True + @utils.log_exceptions(logger=logger) async def _user_turn_completed_task( self, old_task: asyncio.Task[None] | None, info: _EndOfTurnInfo ) -> None: if old_task is not None: - # We never cancel user code as this is very confusing. - # So we wait for the old execution of on_user_turn_completed to finish. - # In practice this is OK because most speeches will be interrupted if a new turn - # is detected. So the previous execution should complete quickly. 
+ await old_task - # When the audio recognition detects the end of a user turn: - # - check if realtime model server-side turn detection is enabled - # - check if there is no current generation happening - # - cancel the current generation if it allows interruptions (otherwise skip this current - # turn) - # - generate a reply to the user input - - # interrupt all background speeches and wait for them to finish to update the chat context + await asyncio.gather(*self._interrupt_background_speeches(force=False)) if isinstance(self.llm, llm.RealtimeModel):