loadxf · loadxf · Mar 2, 2026 · Mar 2, 2026 · Copilot · Mar 2, 2026
diff --git a/gwenn/channels/discord_channel.py b/gwenn/channels/discord_channel.py
@@ -124,6 +124,11 @@ async def stop(self) -> None:
                 await self._audio_transcriber.close()
             except Exception:
                 logger.debug("discord_channel.audio_transcriber_close_error", exc_info=True)
+        if self._tts_synthesizer is not None:
+            try:
+                await self._tts_synthesizer.close()
+            except Exception:
+                logger.debug("discord_channel.tts_synthesizer_close_error", exc_info=True)
         if self._client is not None:
             try:
                 await self._client.close()
@@ -241,6 +246,7 @@ def _session_id_for_interaction(self, interaction, raw_user_id: str) -> str:
             and isinstance(channel_obj, _discord.Thread)
         ):
             raw_thread_id = raw_chat_id
+            raw_chat_id = self._normalize_optional_id(getattr(channel_obj, "parent_id", None))
         scope_key = self._session_scope_key(
             raw_user_id=raw_user_id,
             raw_chat_id=raw_chat_id,
@@ -380,6 +386,55 @@ def _get_audio_transcriber(self):
             logger.warning("discord_channel.transcriber_init_failed", error=str(exc))
             return None
 
+    # ------------------------------------------------------------------
+    # TTS synthesizer helper
+    # ------------------------------------------------------------------
+
+    _tts_synthesizer = None
+
+    def _get_tts_synthesizer(self):
+        """Return the cached TTS synthesizer, building it lazily; None when unavailable."""
+        cached = self._tts_synthesizer
+        if cached is not None:
+            return cached
+        tts_settings = getattr(self._agent._config, "elevenlabs", None)
+        if tts_settings is not None and tts_settings.is_available:
+            try:
+                from gwenn.media.tts import TextToSpeechSynthesizer
+
+                self._tts_synthesizer = TextToSpeechSynthesizer(tts_settings)
+                return self._tts_synthesizer
+            except Exception as exc:
+                logger.warning("discord_channel.tts_init_failed", error=str(exc))
+        return None
+
+    def _should_send_voice(self, is_voice_message: bool) -> bool:
+        """Decide whether this reply should also be delivered as synthesized speech."""
+        tts_settings = getattr(self._agent._config, "elevenlabs", None)
+        if tts_settings is not None and getattr(tts_settings, "is_available", False) is True:
+            return tts_settings.should_send_voice(is_voice_message)
+        return False
+
+    async def _send_voice_reply(self, message, text: str) -> None:
+        """Speak *text* via TTS and reply to *message* with the audio attached.
+
+        Best-effort: any failure is logged as a warning and never raised.
+        """
+        try:
+            import io
+
+            import discord
+
+            tts = self._get_tts_synthesizer()
+            # A missing synthesizer or a None result both mean there is nothing to send.
+            speech = None if tts is None else await tts.synthesize(text)
+            if speech is not None:
+                payload = discord.File(io.BytesIO(speech), filename="response.ogg")
+                await message.reply(file=payload, mention_author=False)
+        except Exception as exc:
+            # Logged rather than re-raised, so errors here never reach the caller.
+            logger.warning("discord_channel.voice_reply_failed", error=str(exc))
+
     # ------------------------------------------------------------------
     # Message handler
     # ------------------------------------------------------------------
@@ -428,6 +483,7 @@ async def _on_message(self, message) -> None:
         # Extract media attachments when media is enabled.
         image_blocks: list[dict] = []
         media_descriptions: list[str] = []
+        is_voice_message = False
         if getattr(self._config, "enable_media", False) and message.attachments:
             image_blocks = await self._extract_image_attachments(message)
 
@@ -452,8 +508,9 @@ async def _on_message(self, message) -> None:
                         desc_parts.append("]")
                     media_descriptions.append("".join(desc_parts))
 
-            # Audio attachments — transcribe.
+            # Audio attachments — transcribe and track for TTS reply.
             audio_attachments = await self._extract_audio_attachments(message)
+            is_voice_message = bool(audio_attachments)
             if audio_attachments:
                 transcriber = self._get_audio_transcriber()
                 for audio_bytes, filename in audio_attachments:
@@ -500,11 +557,10 @@ async def _on_message(self, message) -> None:
             return
 
         raw_chat_id = self._normalize_optional_id(getattr(message.channel, "id", None))
-        raw_thread_id = (
-            raw_chat_id
-            if hasattr(discord, "Thread") and isinstance(message.channel, discord.Thread)
-            else None
-        )
+        raw_thread_id = None
+        if hasattr(discord, "Thread") and isinstance(message.channel, discord.Thread):
+            raw_thread_id = raw_chat_id
+            raw_chat_id = self._normalize_optional_id(getattr(message.channel, "parent_id", None))
         session_scope_key = self._session_scope_key(
             raw_user_id=raw_id,
             raw_chat_id=raw_chat_id,
@@ -576,6 +632,10 @@ async def _on_message(self, message) -> None:
                     except Exception as exc:
                         logger.error("discord_channel.send_error", error=str(exc), exc_info=True)
 
+                    # Send a voice reply if TTS is enabled for this context.
+                    if self._should_send_voice(is_voice_message):
+                        await self._send_voice_reply(message, str(response))
+
                 # Clear the "received" reaction now that we've replied.
                 try:
                     await message.remove_reaction("\U0001f916", self._client.user)

diff --git a/gwenn/channels/telegram_channel.py b/gwenn/channels/telegram_channel.py
@@ -244,6 +244,11 @@ async def stop(self) -> None:
                 await self._audio_transcriber.close()
             except Exception:
                 logger.debug("telegram_channel.audio_transcriber_close_error", exc_info=True)
+        if self._tts_synthesizer is not None:
+            try:
+                await self._tts_synthesizer.close()
+            except Exception:
+                logger.debug("telegram_channel.tts_synthesizer_close_error", exc_info=True)
         if self._app is None:
             return
         try:
@@ -980,6 +985,8 @@ async def _process_user_input(
         context,
         raw_id: str,
         message: UserMessage | str,
+        *,
+        is_voice_message: bool = False,
     ) -> None:
         """Core message processing shared by text and media handlers."""
         if isinstance(message, str):
@@ -1048,6 +1055,10 @@ async def _process_user_input(
                     update.message, chunks, button_rows=button_rows
                 )
 
+                # Send a voice reply if TTS is enabled for this context.
+                if self._should_send_voice(is_voice_message):
+                    await self._send_voice_reply(update, response_text)
+
                 # Clear the "received" reaction now that we've replied.
                 await self._clear_reaction(update.message)
         finally:
@@ -1237,7 +1248,7 @@ async def _on_voice(self, update, context) -> None:
             user_id=raw_id,
             has_transcript=bool(transcript),
         )
-        await self._process_user_input(update, context, raw_id, description)
+        await self._process_user_input(update, context, raw_id, description, is_voice_message=True)
 
     _MAX_VIDEO_BYTES: int = 20 * 1024 * 1024  # 20 MB
 
@@ -1340,6 +1351,50 @@ def _get_audio_transcriber(self):
             logger.warning("telegram_channel.transcriber_init_failed", error=str(exc))
             return None
 
+    # ------------------------------------------------------------------
+    # TTS synthesizer helper
+    # ------------------------------------------------------------------
+
+    _tts_synthesizer = None
+
+    def _get_tts_synthesizer(self):
+        """Hand back the memoized TTS synthesizer, constructing it on demand (or None)."""
+        existing = self._tts_synthesizer
+        if existing is not None:
+            return existing
+        voice_cfg = getattr(self._agent._config, "elevenlabs", None)
+        if voice_cfg is not None and voice_cfg.is_available:
+            try:
+                from gwenn.media.tts import TextToSpeechSynthesizer
+
+                self._tts_synthesizer = TextToSpeechSynthesizer(voice_cfg)
+                return self._tts_synthesizer
+            except Exception as exc:
+                logger.warning("telegram_channel.tts_init_failed", error=str(exc))
+        return None
+
+    def _should_send_voice(self, is_voice_message: bool) -> bool:
+        """Tell whether the configured TTS mode calls for a spoken reply here."""
+        voice_cfg = getattr(self._agent._config, "elevenlabs", None)
+        if voice_cfg is not None and getattr(voice_cfg, "is_available", False) is True:
+            return voice_cfg.should_send_voice(is_voice_message)
+        return False
+
+    async def _send_voice_reply(self, update, text: str) -> None:
+        """Speak *text* via TTS and reply with it as a Telegram voice message.
+
+        Best-effort: any failure is logged as a warning and never raised.
+        """
+        try:
+            import io
+
+            tts = self._get_tts_synthesizer()
+            speech = None if tts is None else await tts.synthesize(text)
+            if speech is not None:
+                await update.message.reply_voice(voice=io.BytesIO(speech))
+        except Exception as exc:
+            logger.warning("telegram_channel.voice_reply_failed", error=str(exc))
+
     # ------------------------------------------------------------------
     # Edited / unsupported / error handlers
     # ------------------------------------------------------------------

diff --git a/gwenn/config.py b/gwenn/config.py
@@ -468,6 +468,43 @@ def is_available(self) -> bool:
         return bool(self.api_key)
 
 
+class ElevenLabsConfig(BaseSettings):
+    """Configuration for ElevenLabs text-to-speech (optional; requires an API key)."""
+
+    api_key: Optional[str] = Field(None, alias="ELEVENLABS_API_KEY")
+    voice_id: str = Field("JBFqnCBsd6RMkjVDRZzb", alias="GWENN_TTS_VOICE_ID")
+    model: str = Field("eleven_turbo_v2_5", alias="GWENN_TTS_MODEL")
+    output_format: str = Field("opus_48000_128", alias="GWENN_TTS_OUTPUT_FORMAT")
+    max_chars: int = Field(2500, alias="GWENN_TTS_MAX_CHARS")
+    # "off" | "voice_reply" | "always"; validate_mode matches it case-insensitively.
+    mode: str = Field("voice_reply", alias="GWENN_TTS_MODE")
+
+    model_config = {"env_file": _ENV_FILE, "extra": "ignore"}
+
+    @property
+    def is_available(self) -> bool:
+        return bool(self.api_key)  # TTS counts as available only when a key is set
+
+    def should_send_voice(self, is_voice_message: bool = False) -> bool:
+        """Whether a voice reply should be sent for this interaction."""
+        wanted = self.mode == "always" or (self.mode == "voice_reply" and is_voice_message)
+        return self.is_available and wanted
+
+    @model_validator(mode="after")
+    def validate_mode(self) -> "ElevenLabsConfig":
+        """Normalize ``mode``; unknown values are logged and become "voice_reply"."""
+        mode = self.mode.strip().lower()  # env values such as "Always" stay usable
+        if mode not in {"off", "voice_reply", "always"}:
+            import structlog
+            structlog.get_logger(__name__).warning(
+                "config.elevenlabs_invalid_mode", provided=self.mode, fallback="voice_reply"
+            )
+            mode = "voice_reply"
+        if mode != self.mode:  # assign only on change so the validator cannot re-trigger
+            self.mode = mode
+        return self
+
+
 class OrchestrationConfig(BaseSettings):
     """Configuration for the subagent orchestration system."""
 
@@ -720,6 +757,9 @@ def __init__(self):
         # Groq Whisper transcription (optional)
         self.groq = GroqConfig()
 
+        # ElevenLabs text-to-speech (optional)
+        self.elevenlabs = ElevenLabsConfig()
+
         # Channel config (channel mode; Telegram/Discord configs loaded lazily)
         self.channel = ChannelConfig()