diff --git a/Proof-of-working b/Proof-of-working new file mode 100644 index 0000000000..e8fb660140 --- /dev/null +++ b/Proof-of-working @@ -0,0 +1,136 @@ +2025-12-04 16:43:48,850 INFO livekit livekit_ffi::server:139:livekit_ffi::server - initializing ffi server v0.12.40 +2025-12-04 16:43:49,249 INFO livekit.agents starting worker +2025-12-04 16:43:49,249 INFO livekit.agents starting inference executor +2025-12-04 16:43:49,273 INFO livekit.agents initializing process +2025-12-04 16:43:53,941 INFO livekit.agents process initialized +2025-12-04 16:43:53,948 INFO livekit.agents HTTP server listening on :63063 +2025-12-04 16:43:53,966 INFO livekit.agents initializing job runner +2025-12-04 16:43:53,969 INFO basic-agent [PREWARM] loading VAD model +2025-12-04 16:43:54,138 INFO basic-agent [PREWARM] VAD model loaded +2025-12-04 16:43:54,140 INFO livekit.agents job runner initialized +2025-12-04 16:43:54,185 INFO basic-agent [ENTRYPOINT] starting AgentSession for room=mock_room +2025-12-04 16:43:54,669 INFO basic-agent [AGENT] on_enter -> generating initial reply +2025-12-04 16:43:55,159 INFO basic-agent [STATE] agent_state changed: initializing -> listening | user_state=listening +2025-12-04 16:43:55,166 WARNING livekit.agents resume_false_interruption is enabled but audio output does not support pause, it will be ignored +2025-12-04 16:43:55,175 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:43:56,488 INFO basic-agent [TOOL] Looking up weather for New York +2025-12-04 16:43:56,497 INFO livekit.agents LLM metrics +2025-12-04 16:43:57,870 INFO livekit.agents LLM metrics +2025-12-04 16:43:59,578 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:00,676 INFO livekit.agents STT metrics +2025-12-04 16:44:00,846 INFO livekit.agents TTS metrics +2025-12-04 16:44:03,698 INFO livekit.agents STT metrics +2025-12-04 16:44:04,538 INFO basic-agent [STATE] agent_state 
changed: speaking -> listening | user_state=listening +2025-12-04 16:44:06,648 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=listening +2025-12-04 16:44:07,104 INFO livekit.agents STT metrics +2025-12-04 16:44:07,346 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:07,386 INFO livekit.agents STT metrics +2025-12-04 16:44:07,386 INFO basic-agent [STT] FINAL text='Yeah.' | agent_state=listening | user_state=listening +2025-12-04 16:44:07,395 INFO basic-agent [LOGIC] words=['yeah'] | is_soft=True | has_hard=False | agent_state=listening +2025-12-04 16:44:07,396 INFO basic-agent [LOGIC] SOFT input while agent NOT speaking -> normal user turn (agent will respond) +2025-12-04 16:44:08,888 INFO livekit.agents LLM metrics +2025-12-04 16:44:09,836 INFO livekit.agents EOU metrics +2025-12-04 16:44:09,849 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:09,849 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:11,146 INFO livekit.agents TTS metrics +2025-12-04 16:44:12,376 INFO livekit.agents STT metrics +2025-12-04 16:44:15,043 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:15,418 INFO livekit.agents STT metrics +2025-12-04 16:44:16,336 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=listening +2025-12-04 16:44:16,988 INFO basic-agent [STT] PARTIAL text='There are no' | agent_state=listening | user_state=speaking +2025-12-04 16:44:17,741 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:17,990 INFO basic-agent [STT] PARTIAL text='Tell me about volcanoes.' 
| agent_state=listening | user_state=listening +2025-12-04 16:44:17,998 INFO livekit.agents STT metrics +2025-12-04 16:44:18,006 INFO basic-agent [STT] FINAL text='Tell me about volcanoes.' | agent_state=listening | user_state=listening +2025-12-04 16:44:18,008 INFO basic-agent [LOGIC] words=['tell', 'me', 'about', 'volcanoes'] | is_soft=False | has_hard=False | agent_state=listening +2025-12-04 16:44:18,010 INFO basic-agent [LOGIC] NORMAL input while agent NOT speaking -> normal user turn +2025-12-04 16:44:18,098 INFO livekit.agents EOU metrics +2025-12-04 16:44:18,098 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:19,746 INFO livekit.agents LLM metrics +2025-12-04 16:44:20,470 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:23,006 INFO livekit.agents STT metrics +2025-12-04 16:44:23,935 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:24,586 INFO livekit.agents STT metrics +2025-12-04 16:44:24,590 INFO basic-agent [STT] FINAL text='Okay.' 
| agent_state=speaking | user_state=speaking +2025-12-04 16:44:24,595 INFO basic-agent [LOGIC] words=['okay'] | is_soft=True | has_hard=False | agent_state=speaking +2025-12-04 16:44:24,598 INFO basic-agent [LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY (no interrupt, no new turn) +2025-12-04 16:44:24,645 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:24,726 WARNING livekit.agents skipping reply to user input, current speech generation cannot be interrupted +2025-12-04 16:44:25,338 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:26,138 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:26,188 INFO livekit.agents STT metrics +2025-12-04 16:44:26,188 INFO basic-agent [STT] FINAL text='Yeah.' | agent_state=speaking | user_state=listening +2025-12-04 16:44:26,196 INFO basic-agent [LOGIC] words=['yeah'] | is_soft=True | has_hard=False | agent_state=speaking +2025-12-04 16:44:26,197 INFO basic-agent [LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY (no interrupt, no new turn) +2025-12-04 16:44:26,268 WARNING livekit.agents skipping reply to user input, current speech generation cannot be interrupted +2025-12-04 16:44:26,636 INFO livekit.agents TTS metrics +2025-12-04 16:44:27,246 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:27,926 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:28,097 INFO livekit.agents STT metrics +2025-12-04 16:44:28,097 INFO basic-agent [STT] FINAL text='Right.' 
| agent_state=speaking | user_state=listening +2025-12-04 16:44:28,107 INFO basic-agent [LOGIC] words=['right'] | is_soft=True | has_hard=False | agent_state=speaking +2025-12-04 16:44:28,107 INFO basic-agent [LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY (no interrupt, no new turn) +2025-12-04 16:44:28,186 WARNING livekit.agents skipping reply to user input, current speech generation cannot be interrupted +2025-12-04 16:44:29,848 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:30,496 INFO livekit.agents STT metrics +2025-12-04 16:44:30,501 INFO basic-agent [STT] FINAL text='Stop.' | agent_state=speaking | user_state=speaking +2025-12-04 16:44:30,503 INFO basic-agent [LOGIC] words=['stop'] | is_soft=False | has_hard=True | agent_state=speaking +2025-12-04 16:44:30,506 INFO basic-agent [LOGIC] HARD interrupt while speaking -> calling session.interrupt(force=True) +2025-12-04 16:44:30,515 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=speaking +2025-12-04 16:44:30,539 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:30,686 WARNING livekit.agents preemptive generation enabled but chat context or tools have changed after `on_user_turn_completed` +2025-12-04 16:44:30,690 INFO livekit.agents EOU metrics +2025-12-04 16:44:30,701 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:31,716 INFO livekit.agents LLM metrics +2025-12-04 16:44:33,675 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:33,679 INFO livekit.agents TTS metrics +2025-12-04 16:44:35,478 INFO livekit.agents STT metrics +2025-12-04 16:44:36,346 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:38,046 INFO basic-agent [STATE] user_state changed: listening -> speaking | 
agent_state=listening +2025-12-04 16:44:38,676 INFO basic-agent [STT] PARTIAL text='Daniel' | agent_state=listening | user_state=speaking +2025-12-04 16:44:39,236 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:39,476 INFO livekit.agents STT metrics +2025-12-04 16:44:39,492 INFO basic-agent [STT] FINAL text='Ten more trucks.' | agent_state=listening | user_state=listening +2025-12-04 16:44:39,496 INFO basic-agent [LOGIC] words=['ten', 'more', 'trucks'] | is_soft=False | has_hard=False | agent_state=listening +2025-12-04 16:44:39,496 INFO basic-agent [LOGIC] NORMAL input while agent NOT speaking -> normal user turn +2025-12-04 16:44:39,646 INFO livekit.agents EOU metrics +2025-12-04 16:44:39,646 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:40,662 INFO livekit.agents LLM metrics +2025-12-04 16:44:41,836 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:44,494 INFO livekit.agents STT metrics +2025-12-04 16:44:47,337 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:47,492 INFO livekit.agents STT metrics +2025-12-04 16:44:48,446 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:48,486 INFO basic-agent [STT] PARTIAL text='Tell me about rocks.' | agent_state=speaking | user_state=listening +2025-12-04 16:44:48,796 INFO livekit.agents STT metrics +2025-12-04 16:44:48,803 INFO basic-agent [STT] FINAL text='Tell me about rocks.' 
| agent_state=speaking | user_state=listening +2025-12-04 16:44:48,806 INFO basic-agent [LOGIC] words=['tell', 'me', 'about', 'rocks'] | is_soft=False | has_hard=False | agent_state=speaking +2025-12-04 16:44:48,806 INFO basic-agent [LOGIC] NON-SOFT utterance while speaking -> treating as interrupt (session.interrupt(force=True)) +2025-12-04 16:44:48,820 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:48,936 WARNING livekit.agents preemptive generation enabled but chat context or tools have changed after `on_user_turn_completed` +2025-12-04 16:44:48,936 INFO livekit.agents EOU metrics +2025-12-04 16:44:48,950 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:50,238 INFO livekit.agents LLM metrics +2025-12-04 16:44:51,024 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:53,776 INFO livekit.agents STT metrics +2025-12-04 16:44:54,816 INFO livekit.agents TTS metrics +2025-12-04 16:44:55,647 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:56,887 INFO basic-agent [STT] PARTIAL text='Okay. Right. But wait.' | agent_state=speaking | user_state=speaking +2025-12-04 16:44:56,946 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:57,093 INFO livekit.agents STT metrics +2025-12-04 16:44:57,096 INFO basic-agent [STT] FINAL text='Okay. Right. But wait.' 
| agent_state=speaking | user_state=listening +2025-12-04 16:44:57,096 INFO basic-agent [LOGIC] words=['okay', 'right', 'but', 'wait'] | is_soft=False | has_hard=True | agent_state=speaking +2025-12-04 16:44:57,096 INFO basic-agent [LOGIC] HARD interrupt while speaking -> calling session.interrupt(force=True) +2025-12-04 16:44:57,113 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:57,192 WARNING livekit.agents preemptive generation enabled but chat context or tools have changed after `on_user_turn_completed` +2025-12-04 16:44:57,194 INFO livekit.agents EOU metrics +2025-12-04 16:44:57,203 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:58,697 INFO livekit.agents LLM metrics +2025-12-04 16:44:59,296 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:59,876 INFO livekit.agents TTS metrics +2025-12-04 16:45:02,092 INFO livekit.agents STT metrics +2025-12-04 16:45:02,846 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:45:05,086 INFO livekit.agents STT metrics +2025-12-04 16:45:08,216 INFO livekit.agents shutting down worker +2025-12-04 16:45:08,235 INFO basic-agent [METRICS] Usage: UsageSummary(llm_prompt_tokens=2032, llm_prompt_cached_tokens=0, llm_input_audio_tokens=0, llm_input_cached_audio_tokens=0, llm_input_text_tokens=0, llm_input_cached_text_tokens=0, llm_input_image_tokens=0, llm_input_cached_image_tokens=0, llm_completion_tokens=235, llm_output_audio_tokens=0, llm_output_image_tokens=0, llm_output_text_tokens=0, tts_characters_count=838, tts_audio_duration=52.919999999999945, stt_audio_duration=69.74999999999991) +2025-12-04 16:45:08,248 INFO livekit.agents process exiting diff --git a/examples/voice_agents/interrupt_config.py b/examples/voice_agents/interrupt_config.py new file mode 100644 index 0000000000..66f3264738 --- 
/dev/null
+++ b/examples/voice_agents/interrupt_config.py
@@ -0,0 +1,50 @@
+# interrupt_config.py
+
+"""
+Configuration for interrupt / backchannel word lists.
+
+Change SOFT_WORDS and HARD_WORDS here without touching the agent logic.
+All words must be lowercase because transcripts are lowercased before matching.
+
+Transcripts are tokenized by splitting on every character outside a-z, so an
+entry only matches when it is a single run of letters. A hyphenated entry such
+as "uh-huh" never matches as a whole; it is recognised through its parts
+("uh" and "huh"), which are listed separately.
+"""
+
+# An utterance made up ONLY of these words is a backchannel: it is ignored while
+# the agent is speaking.
+SOFT_WORDS: set[str] = {
+    "yeah",
+    "yea",
+    "yah",
+    "ya",
+    "ok",
+    "okay",
+    "k",
+    "kk",
+    "hmm",
+    "mm",
+    "mmm",
+    "uh",
+    "uhh",
+    "uhm",
+    "um",
+    "huh",
+    "right",
+    "yep",
+    "yup",
+    "uh-huh",
+    "uhhuh",
+}
+
+# ANY of these words in an utterance interrupts the agent while it is speaking.
+HARD_WORDS: set[str] = {
+    "stop",
+    "wait",
+    "no",
+    "nope",
+    "cancel",
+    "pause",
+    "hold",
+}
diff --git a/examples/voice_agents/interrupt_handle.py b/examples/voice_agents/interrupt_handle.py
new file mode 100644
index 0000000000..5eea8e79db
--- /dev/null
+++ b/examples/voice_agents/interrupt_handle.py
@@ -0,0 +1,306 @@
+"""Voice agent that ignores backchannel words while it is speaking.
+
+Automatic interruptions are disabled on the session; the final-transcript
+handler in ``entrypoint`` uses the word lists from ``interrupt_config`` to
+decide whether speech heard while the agent is talking is ignored or
+interrupts it.
+"""
+
+import logging
+import os
+import re
+
+from dotenv import load_dotenv
+
+from interrupt_config import HARD_WORDS, SOFT_WORDS
+from livekit.agents import (
+    Agent,
+    AgentServer,
+    AgentSession,
+    AgentStateChangedEvent,
+    JobContext,
+    JobProcess,
+    MetricsCollectedEvent,
+    RunContext,
+    UserInputTranscribedEvent,
+    UserStateChangedEvent,
+    cli,
+    metrics,
+    room_io,
+)
+from livekit.agents.llm import function_tool
+from livekit.plugins import silero
+from livekit.plugins.turn_detector.multilingual import MultilingualModel
+
+# uncomment to enable Krisp background voice/noise cancellation
+# from livekit.plugins import noise_cancellation
+
+logger = logging.getLogger("basic-agent")
+
+load_dotenv()
+
+LOG_FORMAT = "%(asctime)s %(levelname)-5s %(name)-12s %(message)s"
+
+# Default transcript location; override with the PROOF_LOG_PATH environment variable.
+DEFAULT_PROOF_LOG = "proof/log-transcript-harshmehta1618.txt"
+
+# Anything outside a-z separates words (input is lowercased first).
+_NON_LETTERS = re.compile(r"[^a-z]+")
+
+
+# ----------------- WORD MATCHING -----------------
+
+
+def _tokens(text: str) -> list[str]:
+    """Lowercase ``text`` and return its runs of a-z letters.
+
+    Splitting on every non-letter means "ok," and "yeah?" still match, and a
+    hyphenated word such as "uh-huh" yields two tokens.
+    """
+    return [w for w in _NON_LETTERS.split(text.lower()) if w]
+
+
+def _is_soft(words: list[str]) -> bool:
+    """Return True when ``words`` is non-empty and every word is in SOFT_WORDS."""
+    return bool(words) and all(w in SOFT_WORDS for w in words)
+
+
+def _has_hard(words: list[str]) -> bool:
+    """Return True when at least one word is in HARD_WORDS."""
+    return any(w in HARD_WORDS for w in words)
+
+
+def _force_interrupt(session: AgentSession) -> None:
+    """Interrupt the agent's current speech; log instead of raising on failure.
+
+    ``force=True`` is passed because the session is created with
+    ``allow_interruptions=False``.
+    """
+    try:
+        session.interrupt(force=True)
+    except Exception:
+        logger.exception("[ERROR] session.interrupt(force=True) raised an exception")
+
+
+# ----------------- AGENT -----------------
+
+
+class MyAgent(Agent):
+    """Voice assistant "Kelly" with a single weather lookup tool."""
+
+    def __init__(self) -> None:
+        # we do NOT set allow_interruptions here; we control it at session-level
+        super().__init__(
+            instructions=(
+                "Your name is Kelly. You interact with users via voice. "
+                "Keep responses concise and to the point. "
+                "Do not use emojis, asterisks, markdown, or other special characters. "
+                "You are curious and friendly, and have a sense of humor. "
+                "You will speak English to the user."
+            ),
+        )
+
+    async def on_enter(self):
+        """Log the event and request an initial reply from the session."""
+        logger.info("[AGENT] on_enter -> generating initial reply")
+        self.session.generate_reply()
+
+    @function_tool
+    async def lookup_weather(
+        self, context: RunContext, location: str, latitude: str, longitude: str
+    ):
+        """Called when the user asks for weather related information."""
+        logger.info(f"[TOOL] Looking up weather for {location}")
+        return "sunny with a temperature of 70 degrees."
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess):
+    """Load the Silero VAD model and store it in ``proc.userdata["vad"]``."""
+    logger.info("[PREWARM] loading VAD model")
+    proc.userdata["vad"] = silero.VAD.load()
+    logger.info("[PREWARM] VAD model loaded")
+
+
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext):
+    """Start an AgentSession for the room.
+
+    Registers metrics collection, state-change logging and the handler that
+    applies the soft/hard word rules to final transcripts.
+    """
+    ctx.log_context_fields = {
+        "room": ctx.room.name,
+    }
+
+    logger.info("[ENTRYPOINT] starting AgentSession for room=%s", ctx.room.name)
+
+    session = AgentSession(
+        # REQUIRED MODELS
+        stt="deepgram/nova-3",
+        llm="openai/gpt-4.1-nano",
+        tts="inworld/inworld-tts-1",
+        # Turn detection + VAD
+        turn_detection=MultilingualModel(),
+        vad=ctx.proc.userdata["vad"],
+        preemptive_generation=True,
+        # false interruption handling (still useful)
+        resume_false_interruption=True,
+        false_interruption_timeout=1.0,
+        # KEY SETTINGS:
+        # DO NOT allow automatic interruptions of TTS by user speech.
+        allow_interruptions=False,
+        # BUT still keep user audio and send to STT while uninterruptible.
+        discard_audio_if_uninterruptible=False,
+    )
+
+    # ------------- METRICS -------------
+
+    usage_collector = metrics.UsageCollector()
+
+    @session.on("metrics_collected")
+    def _on_metrics_collected(ev: MetricsCollectedEvent):
+        metrics.log_metrics(ev.metrics)
+        usage_collector.collect(ev.metrics)
+
+    async def log_usage():
+        summary = usage_collector.get_summary()
+        logger.info(f"[METRICS] Usage: {summary}")
+
+    ctx.add_shutdown_callback(log_usage)
+
+    # ------------- DEBUG STATE LOGS -------------
+
+    @session.on("agent_state_changed")
+    def _on_agent_state_changed(ev: AgentStateChangedEvent):
+        logger.info(
+            "[STATE] agent_state changed: %s -> %s | user_state=%s",
+            ev.old_state,
+            ev.new_state,
+            session.user_state,
+        )
+
+    @session.on("user_state_changed")
+    def _on_user_state_changed(ev: UserStateChangedEvent):
+        logger.info(
+            "[STATE] user_state changed: %s -> %s | agent_state=%s",
+            ev.old_state,
+            ev.new_state,
+            session.agent_state,
+        )
+
+    # ------------- CORE STT LOGIC -------------
+
+    @session.on("user_input_transcribed")
+    def _on_user_input_transcribed(ev: UserInputTranscribedEvent):
+        text = (ev.transcript or "").strip()
+        agent_state = session.agent_state
+
+        logger.info(
+            "[STT] %s text=%r | agent_state=%s | user_state=%s",
+            "FINAL" if ev.is_final else "PARTIAL",
+            text,
+            agent_state,
+            session.user_state,
+        )
+
+        # Only act on non-empty FINAL transcripts (partials are too noisy)
+        if not text or not ev.is_final:
+            return
+
+        words = _tokens(text)
+        is_soft = _is_soft(words)
+        has_hard = _has_hard(words)
+
+        logger.info(
+            "[LOGIC] words=%s | is_soft=%s | has_hard=%s | agent_state=%s",
+            words,
+            is_soft,
+            has_hard,
+            agent_state,
+        )
+
+        # ----- CASE 1: Agent is NOT SPEAKING -----
+        # Everything passes through as normal user input; the branches below only
+        # choose the log line. This gives: "yeah/ok/hmm" while silent -> agent responds.
+        if agent_state != "speaking":
+            if is_soft:
+                logger.info(
+                    "[LOGIC] SOFT input while agent NOT speaking -> normal user turn "
+                    "(agent will respond)"
+                )
+            elif has_hard:
+                logger.info(
+                    "[LOGIC] HARD word while agent NOT speaking -> normal user turn "
+                    "(LLM decides what to do)"
+                )
+            else:
+                logger.info(
+                    "[LOGIC] NORMAL input while agent NOT speaking -> normal user turn"
+                )
+            # No special actions; just let the framework handle it.
+            return
+
+        # ----- CASE 2: Agent is SPEAKING -----
+        # PURE SOFT backchannel (only yeah/ok/hmm, etc.): do nothing, so the agent
+        # continues talking with no new LLM turn. A hard word always wins.
+        if is_soft and not has_hard:
+            logger.info(
+                "[LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY "
+                "(no interrupt, no new turn)"
+            )
+            return
+
+        if has_hard:
+            # HARD interrupt: "stop / wait / no / cancel / pause"
+            logger.info(
+                "[LOGIC] HARD interrupt while speaking -> calling session.interrupt(force=True)"
+            )
+        else:
+            # Other content while speaking: treat as real interrupt for now
+            logger.info(
+                "[LOGIC] NON-SOFT utterance while speaking -> treating as interrupt "
+                "(session.interrupt(force=True))"
+            )
+        _force_interrupt(session)
+
+    # ------------- START SESSION -------------
+
+    await session.start(
+        agent=MyAgent(),
+        room=ctx.room,
+        room_options=room_io.RoomOptions(
+            audio_input=room_io.AudioInputOptions(
+                # noise_cancellation=noise_cancellation.BVC(),
+            ),
+        ),
+    )
+
+
+def _configure_logging(log_path: str) -> None:
+    """Log at INFO level to the console and, as UTF-8, to ``log_path``."""
+    log_dir = os.path.dirname(log_path)
+    if log_dir:
+        # Make sure the target directory exists
+        os.makedirs(log_dir, exist_ok=True)
+
+    # Console logging (optional, for debugging)
+    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
+
+    # File logging for assignment proof
+    file_handler = logging.FileHandler(log_path, encoding="utf-8")
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(logging.Formatter(LOG_FORMAT))
+
+    # Attach to root logger so *all* logs (your [LOGIC], [STT], livekit.agents, etc.) go there
+    logging.getLogger().addHandler(file_handler)
+
+
+if __name__ == "__main__":
+    _configure_logging(os.getenv("PROOF_LOG_PATH") or DEFAULT_PROOF_LOG)
+    cli.run_app(server)
diff --git a/proof-of-working/Log-transcript.txt b/proof-of-working/Log-transcript.txt new file mode 100644 index 0000000000..968a9ec581 --- /dev/null +++ b/proof-of-working/Log-transcript.txt @@ -0,0 +1,136 @@ +2025-12-04 16:43:48,850 INFO livekit livekit_ffi::server:139:livekit_ffi::server - initializing ffi server v0.12.40 +2025-12-04 16:43:49,249 INFO livekit.agents starting worker +2025-12-04 16:43:49,249 INFO livekit.agents starting inference executor +2025-12-04 16:43:49,273 INFO livekit.agents initializing process +2025-12-04 16:43:53,941 INFO livekit.agents process initialized +2025-12-04 16:43:53,948 INFO livekit.agents HTTP server listening on :63063 +2025-12-04 16:43:53,966 INFO livekit.agents initializing job runner +2025-12-04 16:43:53,969 INFO basic-agent [PREWARM] loading VAD model +2025-12-04 16:43:54,138 INFO basic-agent [PREWARM] VAD model loaded +2025-12-04 16:43:54,140 INFO livekit.agents job runner initialized +2025-12-04 16:43:54,185 INFO basic-agent [ENTRYPOINT] starting AgentSession for room=mock_room +2025-12-04 16:43:54,669 INFO basic-agent [AGENT] on_enter -> generating initial reply +2025-12-04 16:43:55,159 INFO basic-agent [STATE] agent_state changed: initializing -> listening | user_state=listening +2025-12-04 16:43:55,166 WARNING livekit.agents resume_false_interruption is enabled but audio output does not support pause, it will be ignored +2025-12-04 16:43:55,175 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:43:56,488 INFO basic-agent [TOOL] Looking up weather for New York +2025-12-04 16:43:56,497 INFO livekit.agents LLM metrics +2025-12-04 16:43:57,870 INFO livekit.agents LLM metrics +2025-12-04 16:43:59,578 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:00,676 INFO livekit.agents STT metrics +2025-12-04
16:44:00,846 INFO livekit.agents TTS metrics +2025-12-04 16:44:03,698 INFO livekit.agents STT metrics +2025-12-04 16:44:04,538 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:06,648 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=listening +2025-12-04 16:44:07,104 INFO livekit.agents STT metrics +2025-12-04 16:44:07,346 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:07,386 INFO livekit.agents STT metrics +2025-12-04 16:44:07,386 INFO basic-agent [STT] FINAL text='Yeah.' | agent_state=listening | user_state=listening +2025-12-04 16:44:07,395 INFO basic-agent [LOGIC] words=['yeah'] | is_soft=True | has_hard=False | agent_state=listening +2025-12-04 16:44:07,396 INFO basic-agent [LOGIC] SOFT input while agent NOT speaking -> normal user turn (agent will respond) +2025-12-04 16:44:08,888 INFO livekit.agents LLM metrics +2025-12-04 16:44:09,836 INFO livekit.agents EOU metrics +2025-12-04 16:44:09,849 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:09,849 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:11,146 INFO livekit.agents TTS metrics +2025-12-04 16:44:12,376 INFO livekit.agents STT metrics +2025-12-04 16:44:15,043 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:15,418 INFO livekit.agents STT metrics +2025-12-04 16:44:16,336 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=listening +2025-12-04 16:44:16,988 INFO basic-agent [STT] PARTIAL text='There are no' | agent_state=listening | user_state=speaking +2025-12-04 16:44:17,741 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:17,990 INFO basic-agent [STT] PARTIAL text='Tell me about volcanoes.' 
| agent_state=listening | user_state=listening +2025-12-04 16:44:17,998 INFO livekit.agents STT metrics +2025-12-04 16:44:18,006 INFO basic-agent [STT] FINAL text='Tell me about volcanoes.' | agent_state=listening | user_state=listening +2025-12-04 16:44:18,008 INFO basic-agent [LOGIC] words=['tell', 'me', 'about', 'volcanoes'] | is_soft=False | has_hard=False | agent_state=listening +2025-12-04 16:44:18,010 INFO basic-agent [LOGIC] NORMAL input while agent NOT speaking -> normal user turn +2025-12-04 16:44:18,098 INFO livekit.agents EOU metrics +2025-12-04 16:44:18,098 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:19,746 INFO livekit.agents LLM metrics +2025-12-04 16:44:20,470 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:23,006 INFO livekit.agents STT metrics +2025-12-04 16:44:23,935 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:24,586 INFO livekit.agents STT metrics +2025-12-04 16:44:24,590 INFO basic-agent [STT] FINAL text='Okay.' 
| agent_state=speaking | user_state=speaking +2025-12-04 16:44:24,595 INFO basic-agent [LOGIC] words=['okay'] | is_soft=True | has_hard=False | agent_state=speaking +2025-12-04 16:44:24,598 INFO basic-agent [LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY (no interrupt, no new turn) +2025-12-04 16:44:24,645 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:24,726 WARNING livekit.agents skipping reply to user input, current speech generation cannot be interrupted +2025-12-04 16:44:25,338 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:26,138 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:26,188 INFO livekit.agents STT metrics +2025-12-04 16:44:26,188 INFO basic-agent [STT] FINAL text='Yeah.' | agent_state=speaking | user_state=listening +2025-12-04 16:44:26,196 INFO basic-agent [LOGIC] words=['yeah'] | is_soft=True | has_hard=False | agent_state=speaking +2025-12-04 16:44:26,197 INFO basic-agent [LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY (no interrupt, no new turn) +2025-12-04 16:44:26,268 WARNING livekit.agents skipping reply to user input, current speech generation cannot be interrupted +2025-12-04 16:44:26,636 INFO livekit.agents TTS metrics +2025-12-04 16:44:27,246 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:27,926 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:28,097 INFO livekit.agents STT metrics +2025-12-04 16:44:28,097 INFO basic-agent [STT] FINAL text='Right.' 
| agent_state=speaking | user_state=listening +2025-12-04 16:44:28,107 INFO basic-agent [LOGIC] words=['right'] | is_soft=True | has_hard=False | agent_state=speaking +2025-12-04 16:44:28,107 INFO basic-agent [LOGIC] SOFT backchannel while speaking -> IGNORE COMPLETELY (no interrupt, no new turn) +2025-12-04 16:44:28,186 WARNING livekit.agents skipping reply to user input, current speech generation cannot be interrupted +2025-12-04 16:44:29,848 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:30,496 INFO livekit.agents STT metrics +2025-12-04 16:44:30,501 INFO basic-agent [STT] FINAL text='Stop.' | agent_state=speaking | user_state=speaking +2025-12-04 16:44:30,503 INFO basic-agent [LOGIC] words=['stop'] | is_soft=False | has_hard=True | agent_state=speaking +2025-12-04 16:44:30,506 INFO basic-agent [LOGIC] HARD interrupt while speaking -> calling session.interrupt(force=True) +2025-12-04 16:44:30,515 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=speaking +2025-12-04 16:44:30,539 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:30,686 WARNING livekit.agents preemptive generation enabled but chat context or tools have changed after `on_user_turn_completed` +2025-12-04 16:44:30,690 INFO livekit.agents EOU metrics +2025-12-04 16:44:30,701 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:31,716 INFO livekit.agents LLM metrics +2025-12-04 16:44:33,675 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:33,679 INFO livekit.agents TTS metrics +2025-12-04 16:44:35,478 INFO livekit.agents STT metrics +2025-12-04 16:44:36,346 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:38,046 INFO basic-agent [STATE] user_state changed: listening -> speaking | 
agent_state=listening +2025-12-04 16:44:38,676 INFO basic-agent [STT] PARTIAL text='Daniel' | agent_state=listening | user_state=speaking +2025-12-04 16:44:39,236 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=listening +2025-12-04 16:44:39,476 INFO livekit.agents STT metrics +2025-12-04 16:44:39,492 INFO basic-agent [STT] FINAL text='Ten more trucks.' | agent_state=listening | user_state=listening +2025-12-04 16:44:39,496 INFO basic-agent [LOGIC] words=['ten', 'more', 'trucks'] | is_soft=False | has_hard=False | agent_state=listening +2025-12-04 16:44:39,496 INFO basic-agent [LOGIC] NORMAL input while agent NOT speaking -> normal user turn +2025-12-04 16:44:39,646 INFO livekit.agents EOU metrics +2025-12-04 16:44:39,646 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:40,662 INFO livekit.agents LLM metrics +2025-12-04 16:44:41,836 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:44,494 INFO livekit.agents STT metrics +2025-12-04 16:44:47,337 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:47,492 INFO livekit.agents STT metrics +2025-12-04 16:44:48,446 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:48,486 INFO basic-agent [STT] PARTIAL text='Tell me about rocks.' | agent_state=speaking | user_state=listening +2025-12-04 16:44:48,796 INFO livekit.agents STT metrics +2025-12-04 16:44:48,803 INFO basic-agent [STT] FINAL text='Tell me about rocks.' 
| agent_state=speaking | user_state=listening +2025-12-04 16:44:48,806 INFO basic-agent [LOGIC] words=['tell', 'me', 'about', 'rocks'] | is_soft=False | has_hard=False | agent_state=speaking +2025-12-04 16:44:48,806 INFO basic-agent [LOGIC] NON-SOFT utterance while speaking -> treating as interrupt (session.interrupt(force=True)) +2025-12-04 16:44:48,820 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:48,936 WARNING livekit.agents preemptive generation enabled but chat context or tools have changed after `on_user_turn_completed` +2025-12-04 16:44:48,936 INFO livekit.agents EOU metrics +2025-12-04 16:44:48,950 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:50,238 INFO livekit.agents LLM metrics +2025-12-04 16:44:51,024 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:53,776 INFO livekit.agents STT metrics +2025-12-04 16:44:54,816 INFO livekit.agents TTS metrics +2025-12-04 16:44:55,647 INFO basic-agent [STATE] user_state changed: listening -> speaking | agent_state=speaking +2025-12-04 16:44:56,887 INFO basic-agent [STT] PARTIAL text='Okay. Right. But wait.' | agent_state=speaking | user_state=speaking +2025-12-04 16:44:56,946 INFO basic-agent [STATE] user_state changed: speaking -> listening | agent_state=speaking +2025-12-04 16:44:57,093 INFO livekit.agents STT metrics +2025-12-04 16:44:57,096 INFO basic-agent [STT] FINAL text='Okay. Right. But wait.' 
| agent_state=speaking | user_state=listening +2025-12-04 16:44:57,096 INFO basic-agent [LOGIC] words=['okay', 'right', 'but', 'wait'] | is_soft=False | has_hard=True | agent_state=speaking +2025-12-04 16:44:57,096 INFO basic-agent [LOGIC] HARD interrupt while speaking -> calling session.interrupt(force=True) +2025-12-04 16:44:57,113 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:44:57,192 WARNING livekit.agents preemptive generation enabled but chat context or tools have changed after `on_user_turn_completed` +2025-12-04 16:44:57,194 INFO livekit.agents EOU metrics +2025-12-04 16:44:57,203 INFO basic-agent [STATE] agent_state changed: listening -> thinking | user_state=listening +2025-12-04 16:44:58,697 INFO livekit.agents LLM metrics +2025-12-04 16:44:59,296 INFO basic-agent [STATE] agent_state changed: thinking -> speaking | user_state=listening +2025-12-04 16:44:59,876 INFO livekit.agents TTS metrics +2025-12-04 16:45:02,092 INFO livekit.agents STT metrics +2025-12-04 16:45:02,846 INFO basic-agent [STATE] agent_state changed: speaking -> listening | user_state=listening +2025-12-04 16:45:05,086 INFO livekit.agents STT metrics +2025-12-04 16:45:08,216 INFO livekit.agents shutting down worker +2025-12-04 16:45:08,235 INFO basic-agent [METRICS] Usage: UsageSummary(llm_prompt_tokens=2032, llm_prompt_cached_tokens=0, llm_input_audio_tokens=0, llm_input_cached_audio_tokens=0, llm_input_text_tokens=0, llm_input_cached_text_tokens=0, llm_input_image_tokens=0, llm_input_cached_image_tokens=0, llm_completion_tokens=235, llm_output_audio_tokens=0, llm_output_image_tokens=0, llm_output_text_tokens=0, tts_characters_count=838, tts_audio_duration=52.919999999999945, stt_audio_duration=69.74999999999991) +2025-12-04 16:45:08,248 INFO livekit.agents process exiting \ No newline at end of file diff --git a/proof-of-working/demo_video.mp4 b/proof-of-working/demo_video.mp4 new file mode 100644 index 0000000000..c6d487d159 
Binary files /dev/null and b/proof-of-working/demo_video.mp4 differ