Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 66 additions & 6 deletions gwenn/channels/discord_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,11 @@ async def stop(self) -> None:
await self._audio_transcriber.close()
except Exception:
logger.debug("discord_channel.audio_transcriber_close_error", exc_info=True)
if self._tts_synthesizer is not None:
try:
await self._tts_synthesizer.close()
except Exception:
logger.debug("discord_channel.tts_synthesizer_close_error", exc_info=True)
if self._client is not None:
try:
await self._client.close()
Expand Down Expand Up @@ -241,6 +246,7 @@ def _session_id_for_interaction(self, interaction, raw_user_id: str) -> str:
and isinstance(channel_obj, _discord.Thread)
):
raw_thread_id = raw_chat_id
raw_chat_id = self._normalize_optional_id(getattr(channel_obj, "parent_id", None))
scope_key = self._session_scope_key(
raw_user_id=raw_user_id,
raw_chat_id=raw_chat_id,
Expand Down Expand Up @@ -380,6 +386,55 @@ def _get_audio_transcriber(self):
logger.warning("discord_channel.transcriber_init_failed", error=str(exc))
return None

# ------------------------------------------------------------------
# TTS synthesizer helper
# ------------------------------------------------------------------

_tts_synthesizer = None

def _get_tts_synthesizer(self):
    """Return the cached TTS synthesizer, building it on first use.

    Returns:
        The synthesizer instance, or ``None`` when the agent config has no
        ``elevenlabs`` section, when that section reports itself as
        unavailable, or when construction fails. A failed construction is
        logged as a warning and leaves the cache empty, so the next call
        tries again.
    """
    cached = self._tts_synthesizer
    if cached is not None:
        return cached

    tts_settings = getattr(self._agent._config, "elevenlabs", None)
    if tts_settings is None:
        return None
    if not tts_settings.is_available:
        return None

    try:
        # Imported inside the try so that an import failure is handled the
        # same way as any other initialization failure.
        from gwenn.media.tts import TextToSpeechSynthesizer

        self._tts_synthesizer = TextToSpeechSynthesizer(tts_settings)
    except Exception as exc:
        logger.warning("discord_channel.tts_init_failed", error=str(exc))
        return None
    return self._tts_synthesizer

def _should_send_voice(self, is_voice_message: bool) -> bool:
"""Return True if a voice reply should be sent given the current TTS mode."""
el_config = getattr(self._agent._config, "elevenlabs", None)
if el_config is None or getattr(el_config, "is_available", False) is not True:
return False
return el_config.should_send_voice(is_voice_message)

async def _send_voice_reply(self, message, text: str) -> None:
    """Synthesize *text* and reply to *message* with the audio as a file.

    Nothing is sent when no synthesizer is available or when synthesis
    returns ``None``. Any ``Exception`` raised along the way (including a
    failed import) is logged as a warning and swallowed.

    Args:
        message: The Discord message being replied to.
        text: The text to turn into speech.
    """
    try:
        import io

        import discord

        tts = self._get_tts_synthesizer()
        if tts is not None:
            audio = await tts.synthesize(text)
            if audio is not None:
                attachment = discord.File(
                    io.BytesIO(audio),
                    filename="response.ogg",
                )
                await message.reply(file=attachment, mention_author=False)
    except Exception as exc:
        logger.warning("discord_channel.voice_reply_failed", error=str(exc))

# ------------------------------------------------------------------
# Message handler
# ------------------------------------------------------------------
Expand Down Expand Up @@ -428,6 +483,7 @@ async def _on_message(self, message) -> None:
# Extract media attachments when media is enabled.
image_blocks: list[dict] = []
media_descriptions: list[str] = []
is_voice_message = False
if getattr(self._config, "enable_media", False) and message.attachments:
image_blocks = await self._extract_image_attachments(message)

Expand All @@ -452,8 +508,9 @@ async def _on_message(self, message) -> None:
desc_parts.append("]")
media_descriptions.append("".join(desc_parts))

# Audio attachments — transcribe.
# Audio attachments — transcribe and track for TTS reply.
audio_attachments = await self._extract_audio_attachments(message)
is_voice_message = bool(audio_attachments)
if audio_attachments:
transcriber = self._get_audio_transcriber()
for audio_bytes, filename in audio_attachments:
Expand Down Expand Up @@ -500,11 +557,10 @@ async def _on_message(self, message) -> None:
return

raw_chat_id = self._normalize_optional_id(getattr(message.channel, "id", None))
raw_thread_id = (
raw_chat_id
if hasattr(discord, "Thread") and isinstance(message.channel, discord.Thread)
else None
)
raw_thread_id = None
if hasattr(discord, "Thread") and isinstance(message.channel, discord.Thread):
raw_thread_id = raw_chat_id
raw_chat_id = self._normalize_optional_id(getattr(message.channel, "parent_id", None))
session_scope_key = self._session_scope_key(
raw_user_id=raw_id,
raw_chat_id=raw_chat_id,
Expand Down Expand Up @@ -576,6 +632,10 @@ async def _on_message(self, message) -> None:
except Exception as exc:
logger.error("discord_channel.send_error", error=str(exc), exc_info=True)

# Send a voice reply if TTS is enabled for this context.
if self._should_send_voice(is_voice_message):
await self._send_voice_reply(message, str(response))

# Clear the "received" reaction now that we've replied.
try:
await message.remove_reaction("\U0001f916", self._client.user)
Expand Down
57 changes: 56 additions & 1 deletion gwenn/channels/telegram_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,11 @@ async def stop(self) -> None:
await self._audio_transcriber.close()
except Exception:
logger.debug("telegram_channel.audio_transcriber_close_error", exc_info=True)
if self._tts_synthesizer is not None:
try:
await self._tts_synthesizer.close()
except Exception:
logger.debug("telegram_channel.tts_synthesizer_close_error", exc_info=True)
if self._app is None:
return
try:
Expand Down Expand Up @@ -980,6 +985,8 @@ async def _process_user_input(
context,
raw_id: str,
message: UserMessage | str,
*,
is_voice_message: bool = False,
) -> None:
"""Core message processing shared by text and media handlers."""
if isinstance(message, str):
Expand Down Expand Up @@ -1048,6 +1055,10 @@ async def _process_user_input(
update.message, chunks, button_rows=button_rows
)

# Send a voice reply if TTS is enabled for this context.
if self._should_send_voice(is_voice_message):
await self._send_voice_reply(update, response_text)

# Clear the "received" reaction now that we've replied.
await self._clear_reaction(update.message)
finally:
Expand Down Expand Up @@ -1237,7 +1248,7 @@ async def _on_voice(self, update, context) -> None:
user_id=raw_id,
has_transcript=bool(transcript),
)
await self._process_user_input(update, context, raw_id, description)
await self._process_user_input(update, context, raw_id, description, is_voice_message=True)

_MAX_VIDEO_BYTES: int = 20 * 1024 * 1024 # 20 MB

Expand Down Expand Up @@ -1340,6 +1351,50 @@ def _get_audio_transcriber(self):
logger.warning("telegram_channel.transcriber_init_failed", error=str(exc))
return None

# ------------------------------------------------------------------
# TTS synthesizer helper
# ------------------------------------------------------------------

_tts_synthesizer = None

def _get_tts_synthesizer(self):
    """Return the TTS synthesizer, creating and caching it if necessary.

    Returns:
        The cached synthesizer, or ``None`` if the agent config has no
        ``elevenlabs`` section, the section reports itself as unavailable,
        or the synthesizer could not be constructed (logged as a warning).
    """
    if self._tts_synthesizer is None:
        settings = getattr(self._agent._config, "elevenlabs", None)
        if settings is None or not settings.is_available:
            return None
        try:
            # The import sits inside the try so an import failure is
            # reported through the same warning as a constructor failure.
            from gwenn.media.tts import TextToSpeechSynthesizer

            self._tts_synthesizer = TextToSpeechSynthesizer(settings)
        except Exception as exc:
            logger.warning("telegram_channel.tts_init_failed", error=str(exc))
            return None
    return self._tts_synthesizer

def _should_send_voice(self, is_voice_message: bool) -> bool:
"""Return True if a voice reply should be sent given the current TTS mode."""
el_config = getattr(self._agent._config, "elevenlabs", None)
if el_config is None or getattr(el_config, "is_available", False) is not True:
return False
return el_config.should_send_voice(is_voice_message)

async def _send_voice_reply(self, update, text: str) -> None:
    """Synthesize *text* and reply to the update's message with a voice note.

    Nothing is sent when no synthesizer is available or when synthesis
    returns ``None``. Any ``Exception`` raised along the way is logged as a
    warning and swallowed.

    Args:
        update: The Telegram update whose ``message`` receives the reply.
        text: The text to turn into speech.
    """
    try:
        import io

        tts = self._get_tts_synthesizer()
        audio = None if tts is None else await tts.synthesize(text)
        if audio is not None:
            await update.message.reply_voice(voice=io.BytesIO(audio))
    except Exception as exc:
        logger.warning("telegram_channel.voice_reply_failed", error=str(exc))

# ------------------------------------------------------------------
# Edited / unsupported / error handlers
# ------------------------------------------------------------------
Expand Down
40 changes: 40 additions & 0 deletions gwenn/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,43 @@ def is_available(self) -> bool:
return bool(self.api_key)


class ElevenLabsConfig(BaseSettings):
    """Configuration for ElevenLabs text-to-speech (optional).

    Each field is populated from the environment variable named by its alias.
    ``mode`` controls when a spoken reply accompanies the text reply:

    * ``"off"``: never.
    * ``"voice_reply"``: only when the incoming message was a voice message.
    * ``"always"``: for every reply.

    ``mode`` is matched case-insensitively and with surrounding whitespace
    ignored, so values such as ``"OFF"`` or ``" always "`` are honored.
    Unrecognized values fall back to ``"voice_reply"`` with a warning.
    """

    api_key: Optional[str] = Field(None, alias="ELEVENLABS_API_KEY")
    voice_id: str = Field("JBFqnCBsd6RMkjVDRZzb", alias="GWENN_TTS_VOICE_ID")
    model: str = Field("eleven_turbo_v2_5", alias="GWENN_TTS_MODEL")
    output_format: str = Field("opus_48000_128", alias="GWENN_TTS_OUTPUT_FORMAT")
    # NOTE(review): presumably the maximum text length sent for synthesis;
    # confirm against the synthesizer that consumes this config.
    max_chars: int = Field(2500, alias="GWENN_TTS_MAX_CHARS")
    mode: str = Field("voice_reply", alias="GWENN_TTS_MODE")

    model_config = {"env_file": _ENV_FILE, "extra": "ignore"}

    @property
    def is_available(self) -> bool:
        """Whether a non-empty API key is configured."""
        return bool(self.api_key)

    def should_send_voice(self, is_voice_message: bool = False) -> bool:
        """Whether a voice reply should be sent for this interaction.

        Args:
            is_voice_message: Whether the incoming message was a voice message.

        Returns:
            ``False`` when no API key is configured or ``mode`` is ``"off"``;
            ``True`` when ``mode`` is ``"always"``; otherwise ``True`` only if
            ``mode`` is ``"voice_reply"`` and the incoming message was voice.
        """
        if not self.is_available or self.mode == "off":
            return False
        if self.mode == "always":
            return True
        # bool() keeps the declared return type even for truthy non-bool input.
        return self.mode == "voice_reply" and bool(is_voice_message)

    @model_validator(mode="after")
    def validate_mode(self) -> "ElevenLabsConfig":
        """Normalize ``mode`` and replace unrecognized values with ``"voice_reply"``."""
        # Environment values are often typed with stray case or whitespace;
        # without normalization "OFF" would be rejected and the fallback
        # would turn voice replies on when the user asked for them off.
        normalized = self.mode.strip().lower()
        if normalized not in {"off", "voice_reply", "always"}:
            import structlog

            structlog.get_logger(__name__).warning(
                "config.elevenlabs_invalid_mode",
                provided=self.mode,
                fallback="voice_reply",
            )
            normalized = "voice_reply"
        self.mode = normalized
        return self
Comment on lines +471 to +505
Copy link

Copilot AI Mar 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New ElevenLabs / TTS environment variables are introduced here (e.g., ELEVENLABS_API_KEY, GWENN_TTS_*), but they aren’t documented in the existing configuration reference or sample env file. Please update docs/configuration.md and .env.example to include these new settings so users can discover and configure TTS correctly.

Copilot uses AI. Check for mistakes.


class OrchestrationConfig(BaseSettings):
"""Configuration for the subagent orchestration system."""

Expand Down Expand Up @@ -720,6 +757,9 @@ def __init__(self):
# Groq Whisper transcription (optional)
self.groq = GroqConfig()

# ElevenLabs text-to-speech (optional)
self.elevenlabs = ElevenLabsConfig()

# Channel config (channel mode; Telegram/Discord configs loaded lazily)
self.channel = ChannelConfig()

Expand Down
Loading
Loading