Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions examples/voice_agents/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,49 @@
<!--Keshav Agrawal
Semantic Interruption Handling

Implementation

Logic is implemented in this file:

examples/voice_agents/basic_agent.py

[Using a SmartInterruptionManager to ensure zero-pause, zero-hiccup behavior while maintaining real-time responsiveness.]

This implementation adds a semantic, state-aware interruption layer on top of LiveKit’s VAD to correctly handle passive backchanneling (e.g., “yeah”, “ok”, “hmm”) without interrupting agent speech.

PROBLEM

LiveKit’s default VAD triggers an interruption on any detected user audio. This causes the agent to stop speaking even when the user is only providing passive acknowledgements.

SOLUTION

VAD-based interruptions remain enabled ([ allow_interruptions=True ]), but automatic resumption of falsely interrupted speech is disabled ([ resume_false_interruption=False ]).
A custom semantic interruption manager is implemented that:

Tracks whether the agent is currently speaking using agent_state_changed

Inspects user STT transcripts via conversation_item_added

Classifies user input as:

Soft input (e.g., “yeah”, “ok”, “hmm”) → ignored while agent is speaking

Hard interruption (e.g., “stop”, “wait”, mixed commands) → immediately interrupts agent audio

Key Behavior
Agent State User Input Result
Speaking "yeah", "ok" Ignored
Speaking "stop", "wait" Interrupted
Speaking "yeah wait" Interrupted
Silent "yeah" Normal response

Configuration

Soft words can be configured via the `SOFT_WORDS` environment variable: [SOFT_WORDS=yeah,ok,hmm,uh-huh,right]

-->

# Voice Agents Examples

This directory contains a comprehensive collection of voice-based agent examples demonstrating various capabilities and integrations with the LiveKit Agents framework.
Expand Down
122 changes: 67 additions & 55 deletions examples/voice_agents/basic_agent.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os
import re
import logging

from dotenv import load_dotenv
Expand All @@ -18,49 +20,30 @@
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

# uncomment to enable Krisp background voice/noise cancellation
# from livekit.plugins import noise_cancellation

logger = logging.getLogger("basic-agent")

load_dotenv()


class MyAgent(Agent):
def __init__(self) -> None:
super().__init__(
instructions="Your name is Kelly. You would interact with users via voice."
"with that in mind keep your responses concise and to the point."
"do not use emojis, asterisks, markdown, or other special characters in your responses."
"You are curious and friendly, and have a sense of humor."
"you will speak english to the user",
instructions=(
"Your name is Kelly. You interact with users via voice. "
"Keep responses concise and natural. "
"Do not use emojis, markdown, or special formatting. "
"You are curious, friendly, and slightly humorous. "
"Speak English only."
)
)

async def on_enter(self):
# when the agent is added to the session, it'll generate a reply
# according to its instructions
self.session.generate_reply()

# all functions annotated with @function_tool will be passed to the LLM when this
# agent is active
@function_tool
async def lookup_weather(
self, context: RunContext, location: str, latitude: str, longitude: str
):
"""Called when the user asks for weather related information.
Ensure the user's location (city or region) is provided.
When given a location, please estimate the latitude and longitude of the location and
do not ask the user for them.

Args:
location: The location they are asking for
latitude: The latitude of the location, do not ask user for it
longitude: The longitude of the location, do not ask user for it
"""

logger.info(f"Looking up weather for {location}")

return "sunny with a temperature of 70 degrees."
return "It is sunny with a temperature of 70 degrees."


server = AgentServer()
Expand All @@ -72,59 +55,88 @@ def prewarm(proc: JobProcess):

server.setup_fnc = prewarm

class SmartInterruptionManager:
    """Semantic interruption layer on top of VAD.

    While the agent is speaking, passive acknowledgements ("yeah", "ok",
    "hmm", ...) are ignored so the agent keeps talking; any other user
    speech is treated as a hard interrupt and the agent's audio buffer is
    cleared immediately.
    """

    # Token pattern keeps apostrophes AND hyphens so configured soft words
    # such as "uh-huh" survive tokenization intact. The previous pattern
    # ([a-zA-Z']+) split "uh-huh" into "uh"/"huh", which never matched the
    # soft-word set and wrongly triggered a hard interrupt.
    _TOKEN_RE = re.compile(r"[a-zA-Z'-]+")

    def __init__(self, session, soft_words):
        """
        Args:
            session: the AgentSession whose audio output is controlled.
            soft_words: iterable of words to ignore while the agent speaks;
                entries are normalized (stripped, lowercased) into a set.
        """
        self.session = session
        self.soft_words = {w.strip().lower() for w in soft_words}

    def _is_agent_speaking(self) -> bool:
        # NOTE(review): relies on session.output.audio.is_playing() —
        # confirm this API exists in the installed livekit-agents version.
        return self.session.output.audio.is_playing()

    def _is_soft_only(self, text: str) -> bool:
        """Return True iff *text* contains no token outside the soft-word set.

        Empty or non-lexical input (noise, punctuation) counts as soft so
        that background sounds do not interrupt the agent.
        """
        if not text:
            return True
        tokens = self._TOKEN_RE.findall(text.lower())
        if not tokens:
            return True
        return all(tok in self.soft_words for tok in tokens)

    def on_user_transcript(self, ev):
        """Handle a user STT transcript event.

        Expects *ev* to carry the transcribed text in ``ev.text``.
        """
        text = (ev.text or "").strip()
        speaking = self._is_agent_speaking()

        logger.info(f"User transcript='{text}' | agent_speaking={speaking}")

        # Agent silent -> let the normal turn-taking flow respond.
        if not speaking:
            return

        # Agent speaking + purely soft input -> keep talking.
        if self._is_soft_only(text):
            logger.info("Soft input detected → continuing speech")
            return

        # Hard interrupt: stop the agent's current audio immediately.
        logger.info("Hard interrupt detected → stopping agent speech")
        self.session.output.audio.clear_buffer()


@server.rtc_session()
async def entrypoint(ctx: JobContext):
# each log entry will include these fields
ctx.log_context_fields = {
"room": ctx.room.name,
}
ctx.log_context_fields = {"room": ctx.room.name}

session = AgentSession(
# Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
# See all available models at https://docs.livekit.io/agents/models/stt/
allow_interruptions=True, # MUST stay True
stt="deepgram/nova-3",
# A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
# See all available models at https://docs.livekit.io/agents/models/llm/
llm="openai/gpt-4.1-mini",
# Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
# See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
llm="openai/gpt-4o-mini",
tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
# VAD and turn detection are used to determine when the user is speaking and when the agent should respond
# See more at https://docs.livekit.io/agents/build/turns
turn_detection=MultilingualModel(),
vad=ctx.proc.userdata["vad"],
# allow the LLM to generate a response while waiting for the end of turn
# See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
preemptive_generation=True,
# sometimes background noise could interrupt the agent session, these are considered false positive interruptions
# when it's detected, you may resume the agent's speech
resume_false_interruption=True,
false_interruption_timeout=1.0,
resume_false_interruption=False,
)

# log metrics as they are emitted, and total usage after session is over
soft_words = os.getenv(
"SOFT_WORDS",
"yeah,ok,okay,uh-huh,hmm,mhm,yep,yup,right,aha"
).split(",")

logger.info(f"Soft words: {soft_words}")

interrupt_manager = SmartInterruptionManager(session, soft_words)

@session.on("user_transcript")
def _on_user_transcript(ev):
interrupt_manager.on_user_transcript(ev)

# Metrics
usage_collector = metrics.UsageCollector()

@session.on("metrics_collected")
def _on_metrics_collected(ev: MetricsCollectedEvent):
def _on_metrics(ev: MetricsCollectedEvent):
metrics.log_metrics(ev.metrics)
usage_collector.collect(ev.metrics)

async def log_usage():
summary = usage_collector.get_summary()
logger.info(f"Usage: {summary}")
logger.info(f"Usage: {usage_collector.get_summary()}")

# shutdown callbacks are triggered when the session is over
ctx.add_shutdown_callback(log_usage)

await session.start(
agent=MyAgent(),
room=ctx.room,
room_options=room_io.RoomOptions(
audio_input=room_io.AudioInputOptions(
# uncomment to enable the Krisp BVC noise cancellation
# noise_cancellation=noise_cancellation.BVC(),
),
audio_input=room_io.AudioInputOptions()
),
)

Expand Down
2 changes: 1 addition & 1 deletion examples/voice_agents/realtime_turn_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ def prewarm(proc: JobProcess):
server.setup_fnc = prewarm

if __name__ == "__main__":
cli.run_app(server)
cli.run_app(server)
2 changes: 1 addition & 1 deletion examples/voice_agents/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
livekit-agents[openai, cartesia, elevenlabs, deepgram, silero, turn-detector, mcp]>=1.0
python-dotenv>=1.0
duckduckgo-search>=8.0
duckduckgo-search>=8.0