diff --git a/.agents/skills/doubao-tts/SKILL.md b/.agents/skills/doubao-tts/SKILL.md new file mode 100644 index 00000000..385d1685 --- /dev/null +++ b/.agents/skills/doubao-tts/SKILL.md @@ -0,0 +1,97 @@ +--- +name: doubao-tts +description: Generate Mandarin and multilingual narration with Volcengine Doubao Speech 2.0. Use when creating Chinese voiceovers, when the user prefers Doubao/Volcengine/火山引擎/豆包 TTS, or when narration needs character-level timestamp metadata for subtitles. +--- + +# Doubao TTS + +Requires `DOUBAO_SPEECH_API_KEY` in `.env`. +Set `DOUBAO_SPEECH_VOICE_TYPE` for the default voice, or pass `voice_id` to the tool. + +## Current API + +Use the new-console API key flow: + +```text +X-Api-Key: ${DOUBAO_SPEECH_API_KEY} +X-Api-Resource-Id: seed-tts-2.0 +``` + +Do not use `X-Api-App-Id` and `X-Api-Access-Key` with a new-console API Key. If the API returns `load grant: requested grant not found`, the key type or auth header is probably wrong. + +For long-form video narration, prefer the async endpoint: + +```text +POST https://openspeech.bytedance.com/api/v3/tts/submit +POST https://openspeech.bytedance.com/api/v3/tts/query +``` + +This returns `audio_url` plus `sentences[].words[]` timing metadata that can be used to build subtitles. + +## OpenMontage Usage + +Generate with the TTS selector: + +```python +from tools.audio.tts_selector import TTSSelector + +result = TTSSelector().execute({ + "preferred_provider": "doubao", + "text": "如果 AI 真的会改变未来,普通人到底该怎么参与?", + "voice_id": "zh_female_vv_uranus_bigtts", + "output_path": "projects/my-video/assets/audio/narration.mp3", + "speech_rate": 0, + "enable_timestamp": True, +}) +``` + +Or call the provider directly: + +```python +from tools.audio.doubao_tts import DoubaoTTS + +result = DoubaoTTS().execute({ + "text": "短样本试听文本。", + "voice_id": "zh_female_vv_uranus_bigtts", + "output_path": "projects/my-video/assets/audio/doubao_sample.mp3", +}) +``` + +The provider writes: + +- `output_path`: downloaded audio file +- `metadata_path`: full query response JSON, defaulting to `.json` + +## Recommended Workflow + +1. Generate a 10-15 second sample before a full paid narration. +2. Ask the user to approve voice naturalness, accent, and speed. +3. Generate the full narration only after approval. +4. Keep the query JSON. It is the source of truth for subtitle timing. +5. Build captions from `sentences[].words[]`, not from estimated text length. +6. Group captions by Chinese semantic phrases before applying timestamps. Do not split only by fixed character count; it can break phrases like "在不押单个公司的情况下" or "可能会被慢慢稀释" and hurt comprehension. +7. Let the video duration follow the approved voice rhythm unless the user explicitly asks to match a prior runtime. + +## Parameters + +- `voice_id`: Doubao `speaker` / voice type. Defaults to `DOUBAO_SPEECH_VOICE_TYPE`. +- `resource_id`: use `seed-tts-2.0` for Doubao Speech 2.0 voices. +- `speech_rate`: `0` is normal, `100` is 2x, `-50` is 0.5x. +- `sample_rate`: default `24000`. +- `enable_timestamp`: default `true`. +- `return_usage`: default `true`, requests usage metadata when available. + +Do not pass `additions.explicit_language` by default. Some endpoint/key combinations reject `zh-cn` with `unsupported additions explicit language zh-cn`. + +For calm Mandarin explainers, start with `speech_rate: 0`. If the result is too long for the approved format, make a short comparison sample with `speech_rate: 25` or `50` before regenerating the full narration. Do not speed up only to match a previous provider's duration if the user prefers Doubao's natural pace. + +## Troubleshooting + +- `load grant: requested grant not found`: wrong key type or wrong auth header. Use `X-Api-Key` for new-console API Keys. +- `speaker permission denied`: voice id is wrong or not authorized for the selected resource. +- `quota exceeded`: quota, lifetime characters, or concurrency exceeded. +- Missing timestamps: verify `enable_timestamp: true`, keep the query JSON, and confirm the selected endpoint returned `sentences`. + +## Safety + +Never print or write the API key to logs, metadata, patches, or project artifacts. `.env.example` should contain only empty variable names. diff --git a/.env.example b/.env.example index 884c01cb..cb32576a 100644 --- a/.env.example +++ b/.env.example @@ -13,6 +13,8 @@ GOOGLE_API_KEY= # Google Imagen images, Google Cloud TTS (700+ voic ELEVENLABS_API_KEY= # TTS narration, music generation, sound effects OPENAI_API_KEY= # OpenAI TTS fallback and DALL-E image generation XAI_API_KEY= # Grok image generation/editing and Grok video generation +DOUBAO_SPEECH_API_KEY= # Volcengine Doubao Speech TTS (new console API Key) +DOUBAO_SPEECH_VOICE_TYPE= # Default Doubao speaker/voice type, e.g. zh_female_vv_uranus_bigtts # Piper local voices do not require env vars; install `piper-tts` via pip # --- Music --- diff --git a/docs/PROVIDERS.md b/docs/PROVIDERS.md index 5d84ed47..f0460a69 100644 --- a/docs/PROVIDERS.md +++ b/docs/PROVIDERS.md @@ -39,6 +39,8 @@ GOOGLE_API_KEY= # Google TTS + Google Imagen ELEVENLABS_API_KEY= # TTS, music, sound effects (10K chars/month free) OPENAI_API_KEY= # OpenAI TTS + DALL-E 3 images XAI_API_KEY= # xAI Grok image generation/editing + Grok video generation +DOUBAO_SPEECH_API_KEY= # Volcengine Doubao Speech TTS (strong Mandarin narration) +DOUBAO_SPEECH_VOICE_TYPE= # Default Doubao speaker/voice type # MULTI-MODEL GATEWAY (one key, 6+ tools) FAL_KEY= # FLUX, Recraft, Kling, Veo, MiniMax video @@ -159,6 +161,52 @@ No subscription — pure pay-as-you-go, no minimum spend. --- +### Doubao Speech — Mandarin TTS + +> **Strong Mandarin narration.** Volcengine Doubao Speech is a good choice for Chinese explainer voiceovers and long-form narration that needs subtitle timing metadata. + +**Tools unlocked:** `doubao_tts` +**Env vars:** `DOUBAO_SPEECH_API_KEY`, `DOUBAO_SPEECH_VOICE_TYPE` + +#### Setup + +1. Open the Volcengine Doubao Speech console and enable Speech Synthesis 2.0. +2. Create a new-console API Key. +3. Choose a Speech 2.0 voice type, for example `zh_female_vv_uranus_bigtts`. +4. Add to `.env`: + ```bash + DOUBAO_SPEECH_API_KEY=your-api-key + DOUBAO_SPEECH_VOICE_TYPE=zh_female_vv_uranus_bigtts + ``` + +#### API Notes + +OpenMontage uses the new-console API key flow: + +```text +X-Api-Key: ${DOUBAO_SPEECH_API_KEY} +X-Api-Resource-Id: seed-tts-2.0 +``` + +Do not pass a new-console API Key as `X-Api-App-Id` or `X-Api-Access-Key`. That mismatch can produce `load grant: requested grant not found`. + +#### What It Is Best For + +- Natural Mandarin narration for Chinese-language explainers +- Async long-form narration via `/api/v3/tts/submit` and `/api/v3/tts/query` +- Character-level timing metadata for subtitle alignment +- Calm educational pacing where the video duration can follow the approved voice rhythm + +#### Pacing + +Start with `speech_rate: 0` for natural Mandarin delivery. If the approved format needs a tighter runtime, compare short samples at `speech_rate: 25` or `50` before generating the full narration. Do not force Doubao to match another provider's duration unless the user explicitly wants that tradeoff. + +#### Pricing + +Doubao Speech 2.0 is billed by character package or usage in Volcengine. OpenMontage estimates cost from text length and prefers provider-returned usage metadata when available. + +--- + ### Google — TTS + Imagen (Shared Key) > **One key, two tools.** Google Cloud TTS has 700+ voices in 50+ languages — the strongest localization option. Imagen 4 generates high-quality images. diff --git a/tools/audio/doubao_tts.py b/tools/audio/doubao_tts.py new file mode 100644 index 00000000..331aff38 --- /dev/null +++ b/tools/audio/doubao_tts.py @@ -0,0 +1,420 @@ +"""Doubao Speech text-to-speech provider tool.""" + +from __future__ import annotations + +import json +import os +import time +import uuid +from pathlib import Path +from typing import Any + +from tools.base_tool import ( + BaseTool, + Determinism, + ExecutionMode, + ResourceProfile, + RetryPolicy, + ToolResult, + ToolRuntime, + ToolStability, + ToolStatus, + ToolTier, +) + + +class DoubaoTTS(BaseTool): + name = "doubao_tts" + version = "0.1.0" + tier = ToolTier.VOICE + capability = "tts" + provider = "doubao" + stability = ToolStability.EXPERIMENTAL + execution_mode = ExecutionMode.ASYNC + determinism = Determinism.STOCHASTIC + runtime = ToolRuntime.API + + dependencies = [] + install_instructions = ( + "Set DOUBAO_SPEECH_API_KEY to a Volcengine Doubao Speech API Key.\n" + "Optional: set DOUBAO_SPEECH_VOICE_TYPE to the default speaker voice.\n" + "Use the new console API key flow; do not pass app id/access token as the API key." + ) + fallback = "google_tts" + fallback_tools = ["google_tts", "elevenlabs_tts", "openai_tts", "piper_tts"] + agent_skills = ["doubao-tts", "text-to-speech"] + + capabilities = [ + "text_to_speech", + "voice_selection", + "multilingual", + "timestamp_alignment", + ] + supports = { + "voice_cloning": False, + "multilingual": True, + "offline": False, + "native_audio": True, + "timestamps": True, + "long_text_async": True, + } + best_for = [ + "natural Mandarin narration", + "Chinese explainer voiceovers with character-level timestamps", + "long-form narration that needs subtitle alignment", + ] + not_good_for = [ + "fully offline production", + "voice clone matching", + "real-time interactive speech playback", + ] + + input_schema = { + "type": "object", + "required": ["text"], + "properties": { + "text": {"type": "string", "description": "Text to convert to speech"}, + "voice_id": { + "type": "string", + "description": "Doubao speaker/voice_type. Defaults to DOUBAO_SPEECH_VOICE_TYPE.", + }, + "resource_id": { + "type": "string", + "default": "seed-tts-2.0", + "description": "Volcengine resource id. Use seed-tts-2.0 for Doubao Speech 2.0 voices.", + }, + "format": { + "type": "string", + "default": "mp3", + "enum": ["mp3", "ogg_opus", "pcm"], + }, + "sample_rate": { + "type": "integer", + "default": 24000, + "enum": [8000, 16000, 22050, 24000, 32000, 44100, 48000], + }, + "speech_rate": { + "type": "integer", + "default": 0, + "minimum": -50, + "maximum": 100, + "description": "Doubao speech rate. 0=normal, 100=2x, -50=0.5x.", + }, + "enable_timestamp": { + "type": "boolean", + "default": True, + "description": "Return sentence/word timing metadata when supported by the selected endpoint.", + }, + "disable_markdown_filter": { + "type": "boolean", + "default": False, + "description": "Pass through Doubao markdown filtering behavior. Defaults to API-safe false.", + }, + "return_usage": { + "type": "boolean", + "default": True, + "description": "Request usage token data from Volcengine when available.", + }, + "output_path": {"type": "string"}, + "metadata_path": { + "type": "string", + "description": "Where to save the full query JSON. Defaults next to output_path.", + }, + "poll_interval_seconds": { + "type": "number", + "default": 2.0, + "minimum": 0.5, + }, + "timeout_seconds": { + "type": "integer", + "default": 300, + "minimum": 30, + }, + }, + } + + output_schema = { + "type": "object", + "properties": { + "output": {"type": "string"}, + "metadata_path": {"type": "string"}, + "task_id": {"type": "string"}, + "audio_duration_seconds": {"type": ["number", "null"]}, + "sentences": {"type": "array"}, + "usage": {"type": ["object", "null"]}, + }, + } + artifact_schema = { + "type": "array", + "items": {"type": "string"}, + } + + resource_profile = ResourceProfile( + cpu_cores=1, ram_mb=256, vram_mb=0, disk_mb=50, network_required=True + ) + retry_policy = RetryPolicy( + max_retries=2, + backoff_seconds=2.0, + retryable_errors=["timeout", "rate_limit", "quota exceeded for types: concurrency"], + ) + idempotency_key_fields = ["text", "voice_id", "resource_id", "speech_rate", "sample_rate"] + side_effects = [ + "writes audio file to output_path", + "writes Doubao query metadata JSON next to output_path", + "calls Volcengine Doubao Speech API", + ] + user_visible_verification = [ + "Listen to generated audio for Mandarin naturalness and pacing", + "Check timestamp JSON before building subtitles", + ] + quality_score = 0.88 + latency_p50_seconds = 8.0 + + SUBMIT_URL = "https://openspeech.bytedance.com/api/v3/tts/submit" + QUERY_URL = "https://openspeech.bytedance.com/api/v3/tts/query" + DEFAULT_RESOURCE_ID = "seed-tts-2.0" + DEFAULT_VOICE_ENV = "DOUBAO_SPEECH_VOICE_TYPE" + + def get_status(self) -> ToolStatus: + if os.environ.get("DOUBAO_SPEECH_API_KEY"): + return ToolStatus.AVAILABLE + return ToolStatus.UNAVAILABLE + + def estimate_cost(self, inputs: dict[str, Any]) -> float: + # Volcengine bills Doubao Speech 2.0 by characters. Keep this conservative + # and prefer provider-returned usage when available. + return round(len(inputs.get("text", "")) * 0.000015, 4) + + def execute(self, inputs: dict[str, Any]) -> ToolResult: + api_key = os.environ.get("DOUBAO_SPEECH_API_KEY") + if not api_key: + return ToolResult(success=False, error="No Doubao Speech API key. " + self.install_instructions) + + voice_id = inputs.get("voice_id") or os.environ.get(self.DEFAULT_VOICE_ENV) + if not voice_id: + return ToolResult( + success=False, + error=( + "No Doubao voice_id provided. Pass voice_id or set " + f"{self.DEFAULT_VOICE_ENV} in the environment." + ), + ) + + start = time.time() + try: + result = self._generate(inputs, api_key=api_key, voice_id=voice_id) + except Exception as exc: + return ToolResult(success=False, error=f"Doubao TTS failed: {self._safe_error(exc)}") + + result.duration_seconds = round(time.time() - start, 2) + if not result.cost_usd: + result.cost_usd = self.estimate_cost(inputs) + return result + + def _generate(self, inputs: dict[str, Any], *, api_key: str, voice_id: str) -> ToolResult: + import requests + + text = inputs["text"] + fmt = inputs.get("format", "mp3") + resource_id = inputs.get("resource_id", self.DEFAULT_RESOURCE_ID) + output_path = Path(inputs.get("output_path", f"doubao_tts.{self._extension_for_format(fmt)}")) + metadata_path = Path( + inputs.get("metadata_path") or output_path.with_suffix(output_path.suffix + ".json") + ) + output_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + + req_id = str(uuid.uuid4()) + headers = self._headers( + api_key=api_key, + resource_id=resource_id, + request_id=req_id, + return_usage=bool(inputs.get("return_usage", True)), + ) + body = self._submit_body(inputs, voice_id=voice_id, request_id=req_id) + + submit_response = requests.post(self.SUBMIT_URL, headers=headers, json=body, timeout=(10, 60)) + submit_data = self._json_or_raise(submit_response) + self._raise_for_doubao_error(submit_response.status_code, submit_data) + + task_id = submit_data.get("data", {}).get("task_id") + if not task_id: + raise RuntimeError("Doubao submit succeeded but did not return data.task_id") + + query_data = self._poll_query( + requests_module=requests, + api_key=api_key, + resource_id=resource_id, + task_id=task_id, + return_usage=bool(inputs.get("return_usage", True)), + poll_interval=float(inputs.get("poll_interval_seconds", 2.0)), + timeout_seconds=int(inputs.get("timeout_seconds", 300)), + ) + data = query_data.get("data", {}) + audio_url = data.get("audio_url") + if not audio_url: + raise RuntimeError("Doubao task completed but did not return data.audio_url") + + audio_response = requests.get(audio_url, timeout=(10, 120)) + audio_response.raise_for_status() + output_path.write_bytes(audio_response.content) + metadata_path.write_text(json.dumps(query_data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + + audio_duration = self._audio_duration(output_path) + usage = data.get("usage") + cost = self._cost_from_usage(usage) or self.estimate_cost(inputs) + + return ToolResult( + success=True, + data={ + "provider": self.provider, + "model": resource_id, + "resource_id": resource_id, + "voice_id": voice_id, + "format": fmt, + "sample_rate": inputs.get("sample_rate", 24000), + "speech_rate": inputs.get("speech_rate", 0), + "text_length": len(text), + "task_id": task_id, + "task_status": data.get("task_status"), + "req_text_length": data.get("req_text_length"), + "synthesize_text_length": data.get("synthesize_text_length"), + "audio_duration_seconds": round(audio_duration, 2) if audio_duration else None, + "output": str(output_path), + "metadata_path": str(metadata_path), + "sentences": data.get("sentences", []), + "usage": usage, + "url_expire_time": data.get("url_expire_time"), + }, + artifacts=[str(output_path), str(metadata_path)], + cost_usd=cost, + model=resource_id, + ) + + def _headers( + self, + *, + api_key: str, + resource_id: str, + request_id: str, + return_usage: bool, + ) -> dict[str, str]: + headers = { + "X-Api-Key": api_key, + "X-Api-Resource-Id": resource_id, + "X-Api-Request-Id": request_id, + "Content-Type": "application/json", + } + if return_usage: + headers["X-Control-Require-Usage-Tokens-Return"] = "true" + return headers + + def _submit_body(self, inputs: dict[str, Any], *, voice_id: str, request_id: str) -> dict[str, Any]: + audio_params = { + "format": inputs.get("format", "mp3"), + "sample_rate": inputs.get("sample_rate", 24000), + "speech_rate": inputs.get("speech_rate", 0), + "enable_timestamp": bool(inputs.get("enable_timestamp", True)), + } + additions = { + "disable_markdown_filter": bool(inputs.get("disable_markdown_filter", False)), + } + return { + "user": {"uid": inputs.get("user_id", "openmontage")}, + "unique_id": request_id, + "req_params": { + "text": inputs["text"], + "speaker": voice_id, + "audio_params": audio_params, + "additions": json.dumps(additions, ensure_ascii=False), + }, + } + + def _poll_query( + self, + *, + requests_module: Any, + api_key: str, + resource_id: str, + task_id: str, + return_usage: bool, + poll_interval: float, + timeout_seconds: int, + ) -> dict[str, Any]: + deadline = time.time() + timeout_seconds + while time.time() < deadline: + time.sleep(poll_interval) + headers = self._headers( + api_key=api_key, + resource_id=resource_id, + request_id=str(uuid.uuid4()), + return_usage=return_usage, + ) + response = requests_module.post(self.QUERY_URL, headers=headers, json={"task_id": task_id}, timeout=(10, 60)) + query_data = self._json_or_raise(response) + self._raise_for_doubao_error(response.status_code, query_data) + status = query_data.get("data", {}).get("task_status") + if status == 2: + return query_data + if status == 3: + raise RuntimeError(f"Doubao task failed: {query_data.get('message', 'unknown error')}") + raise TimeoutError(f"Doubao task did not finish within {timeout_seconds} seconds") + + @staticmethod + def _json_or_raise(response: Any) -> dict[str, Any]: + try: + return response.json() + except ValueError as exc: + raise RuntimeError(f"Non-JSON response from Doubao API: HTTP {response.status_code}") from exc + + def _raise_for_doubao_error(self, http_status: int, payload: dict[str, Any]) -> None: + code = payload.get("code") + if http_status < 400 and code == 20000000: + return + message = payload.get("message", "unknown error") + hint = self._diagnostic_hint(message) + raise RuntimeError(f"HTTP {http_status}, code {code}: {message}{hint}") + + @staticmethod + def _diagnostic_hint(message: str) -> str: + lowered = message.lower() + if "load grant" in lowered or "requested grant not found" in lowered: + return " (check DOUBAO_SPEECH_API_KEY and use the new-console X-Api-Key flow)" + if "speaker permission denied" in lowered or "access denied" in lowered: + return " (check voice_id/DOUBAO_SPEECH_VOICE_TYPE and voice authorization)" + if "quota exceeded" in lowered: + return " (check quota, concurrency, or remaining character package)" + if "unsupported additions explicit language" in lowered: + return " (do not pass additions.explicit_language for this endpoint)" + return "" + + @staticmethod + def _safe_error(exc: Exception) -> str: + # Avoid ever echoing request headers or secrets in user-visible errors. + return str(exc).replace(os.environ.get("DOUBAO_SPEECH_API_KEY", ""), "[redacted]") + + @staticmethod + def _extension_for_format(fmt: str) -> str: + if fmt == "ogg_opus": + return "ogg" + if fmt == "pcm": + return "pcm" + return "mp3" + + @staticmethod + def _audio_duration(path: Path) -> float | None: + try: + from tools.analysis.audio_probe import probe_duration + + return probe_duration(path) + except Exception: + return None + + @staticmethod + def _cost_from_usage(usage: Any) -> float | None: + if not isinstance(usage, dict): + return None + text_words = usage.get("text_words") + if not isinstance(text_words, (int, float)): + return None + return round(float(text_words) * 0.000015, 4)