Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions examples/voice_agents/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,49 @@
<!--Keshav Agrawal
Semantic Interruption Handling

Implementation

Logic is implemented in this file:

examples/voice_agents/basic_agent.py

[Using a SmartInterruptionManager to ensure zero-pause, zero-hiccup behavior while maintaining real-time responsiveness.]

This implementation adds a semantic, state-aware interruption layer on top of LiveKit’s VAD to correctly handle passive backchanneling (e.g., “yeah”, “ok”, “hmm”) without interrupting agent speech.

PROBLEM

LiveKit’s default VAD triggers an interruption on any detected user audio. This causes the agent to stop speaking even when the user is only providing passive acknowledgements.

SOLUTION

VAD-based interruptions remain enabled ([ allow_interruptions=True ]), but automatic resumption of falsely interrupted speech is disabled ([ resume_false_interruption=False ]).
A custom semantic interruption manager is implemented that:

Tracks whether the agent is currently speaking using agent_state_changed

Inspects user STT transcripts via conversation_item_added

Classifies user input as:

Soft input (e.g., “yeah”, “ok”, “hmm”) → ignored while agent is speaking

Hard interruption (e.g., “stop”, “wait”, mixed commands) → immediately interrupts agent audio

Key Behavior
Agent State User Input Result
Speaking "yeah", "ok" Ignored
Speaking "stop", "wait" Interrupted
Speaking "yeah wait" Interrupted
Silent "yeah" Normal response

Configuration

Soft words can be configured via the `SOFT_WORDS` environment variable: [SOFT_WORDS=yeah,ok,hmm,uh-huh,right]

-->

# Voice Agents Examples

This directory contains a comprehensive collection of voice-based agent examples demonstrating various capabilities and integrations with the LiveKit Agents framework.
Expand Down
122 changes: 67 additions & 55 deletions examples/voice_agents/basic_agent.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os
import re
import logging

from dotenv import load_dotenv
Expand All @@ -18,49 +20,30 @@
from livekit.plugins import silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

# uncomment to enable Krisp background voice/noise cancellation
# from livekit.plugins import noise_cancellation

logger = logging.getLogger("basic-agent")

load_dotenv()


class MyAgent(Agent):
def __init__(self) -> None:
super().__init__(
instructions="Your name is Kelly. You would interact with users via voice."
"with that in mind keep your responses concise and to the point."
"do not use emojis, asterisks, markdown, or other special characters in your responses."
"You are curious and friendly, and have a sense of humor."
"you will speak english to the user",
instructions=(
"Your name is Kelly. You interact with users via voice. "
"Keep responses concise and natural. "
"Do not use emojis, markdown, or special formatting. "
"You are curious, friendly, and slightly humorous. "
"Speak English only."
)
)

async def on_enter(self):
# when the agent is added to the session, it'll generate a reply
# according to its instructions
self.session.generate_reply()

# all functions annotated with @function_tool will be passed to the LLM when this
# agent is active
@function_tool
async def lookup_weather(
self, context: RunContext, location: str, latitude: str, longitude: str
):
"""Called when the user asks for weather related information.
Ensure the user's location (city or region) is provided.
When given a location, please estimate the latitude and longitude of the location and
do not ask the user for them.

Args:
location: The location they are asking for
latitude: The latitude of the location, do not ask user for it
longitude: The longitude of the location, do not ask user for it
"""

logger.info(f"Looking up weather for {location}")

return "sunny with a temperature of 70 degrees."
return "It is sunny with a temperature of 70 degrees."


server = AgentServer()
Expand All @@ -72,59 +55,88 @@ def prewarm(proc: JobProcess):

server.setup_fnc = prewarm

class SmartInterruptionManager:
    """Semantic interruption layer on top of VAD.

    While the agent is speaking, passive acknowledgements ("yeah", "ok",
    "hmm", ...) are ignored so the agent keeps talking; any other user
    speech is treated as a hard interrupt and the agent's audio buffer is
    cleared immediately.
    """

    # Token pattern keeps apostrophes AND hyphens so configured soft words
    # such as "uh-huh" survive tokenization intact. The previous pattern
    # ([a-zA-Z']+) split "uh-huh" into "uh"/"huh", which never matched the
    # soft-word set and wrongly triggered a hard interrupt.
    _TOKEN_RE = re.compile(r"[a-zA-Z'-]+")

    def __init__(self, session, soft_words):
        """
        Args:
            session: the AgentSession whose audio output is controlled.
            soft_words: iterable of words to ignore while the agent speaks;
                entries are normalized (stripped, lowercased) into a set.
        """
        self.session = session
        self.soft_words = {w.strip().lower() for w in soft_words}

    def _is_agent_speaking(self) -> bool:
        # NOTE(review): relies on session.output.audio.is_playing() —
        # confirm this API exists in the installed livekit-agents version.
        return self.session.output.audio.is_playing()

    def _is_soft_only(self, text: str) -> bool:
        """Return True iff *text* contains no token outside the soft-word set.

        Empty or non-lexical input (noise, punctuation) counts as soft so
        that background sounds do not interrupt the agent.
        """
        if not text:
            return True
        tokens = self._TOKEN_RE.findall(text.lower())
        if not tokens:
            return True
        return all(tok in self.soft_words for tok in tokens)

    def on_user_transcript(self, ev):
        """Handle a user STT transcript event.

        Expects *ev* to carry the transcribed text in ``ev.text``.
        """
        text = (ev.text or "").strip()
        speaking = self._is_agent_speaking()

        logger.info(f"User transcript='{text}' | agent_speaking={speaking}")

        # Agent silent -> let the normal turn-taking flow respond.
        if not speaking:
            return

        # Agent speaking + purely soft input -> keep talking.
        if self._is_soft_only(text):
            logger.info("Soft input detected → continuing speech")
            return

        # Hard interrupt: stop the agent's current audio immediately.
        logger.info("Hard interrupt detected → stopping agent speech")
        self.session.output.audio.clear_buffer()


@server.rtc_session()
async def entrypoint(ctx: JobContext):
# each log entry will include these fields
ctx.log_context_fields = {
"room": ctx.room.name,
}
ctx.log_context_fields = {"room": ctx.room.name}

session = AgentSession(
# Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
# See all available models at https://docs.livekit.io/agents/models/stt/
allow_interruptions=True, # MUST stay True
stt="deepgram/nova-3",
# A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
# See all available models at https://docs.livekit.io/agents/models/llm/
llm="openai/gpt-4.1-mini",
# Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
# See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
llm="openai/gpt-4o-mini",
tts="cartesia/sonic-2:9626c31c-bec5-4cca-baa8-f8ba9e84c8bc",
# VAD and turn detection are used to determine when the user is speaking and when the agent should respond
# See more at https://docs.livekit.io/agents/build/turns
turn_detection=MultilingualModel(),
vad=ctx.proc.userdata["vad"],
# allow the LLM to generate a response while waiting for the end of turn
# See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
preemptive_generation=True,
# sometimes background noise could interrupt the agent session, these are considered false positive interruptions
# when it's detected, you may resume the agent's speech
resume_false_interruption=True,
false_interruption_timeout=1.0,
resume_false_interruption=False,
)

# log metrics as they are emitted, and total usage after session is over
soft_words = os.getenv(
"SOFT_WORDS",
"yeah,ok,okay,uh-huh,hmm,mhm,yep,yup,right,aha"
).split(",")

logger.info(f"Soft words: {soft_words}")

interrupt_manager = SmartInterruptionManager(session, soft_words)

@session.on("user_transcript")
def _on_user_transcript(ev):
interrupt_manager.on_user_transcript(ev)

# Metrics
usage_collector = metrics.UsageCollector()

@session.on("metrics_collected")
def _on_metrics_collected(ev: MetricsCollectedEvent):
def _on_metrics(ev: MetricsCollectedEvent):
metrics.log_metrics(ev.metrics)
usage_collector.collect(ev.metrics)

async def log_usage():
summary = usage_collector.get_summary()
logger.info(f"Usage: {summary}")
logger.info(f"Usage: {usage_collector.get_summary()}")

# shutdown callbacks are triggered when the session is over
ctx.add_shutdown_callback(log_usage)

await session.start(
agent=MyAgent(),
room=ctx.room,
room_options=room_io.RoomOptions(
audio_input=room_io.AudioInputOptions(
# uncomment to enable the Krisp BVC noise cancellation
# noise_cancellation=noise_cancellation.BVC(),
),
audio_input=room_io.AudioInputOptions()
),
)

Expand Down
2 changes: 1 addition & 1 deletion examples/voice_agents/realtime_turn_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,4 @@ def prewarm(proc: JobProcess):
server.setup_fnc = prewarm

if __name__ == "__main__":
cli.run_app(server)
cli.run_app(server)
2 changes: 1 addition & 1 deletion examples/voice_agents/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
livekit-agents[openai, cartesia, elevenlabs, deepgram, silero, turn-detector, mcp]>=1.0
python-dotenv>=1.0
duckduckgo-search>=8.0
duckduckgo-search>=8.0