diff --git a/.agents/skills/doubao-tts/SKILL.md b/.agents/skills/doubao-tts/SKILL.md
new file mode 100644
index 00000000..385d1685
--- /dev/null
+++ b/.agents/skills/doubao-tts/SKILL.md
@@ -0,0 +1,97 @@
+---
+name: doubao-tts
+description: Generate Mandarin and multilingual narration with Volcengine Doubao Speech 2.0. Use when creating Chinese voiceovers, when the user prefers Doubao/Volcengine/火山引擎/豆包 TTS, or when narration needs character-level timestamp metadata for subtitles.
+---
+
+# Doubao TTS
+
+Requires `DOUBAO_SPEECH_API_KEY` in `.env`.
+Set `DOUBAO_SPEECH_VOICE_TYPE` for the default voice, or pass `voice_id` to the tool.
+
+## Current API
+
+Use the new-console API key flow:
+
+```text
+X-Api-Key: ${DOUBAO_SPEECH_API_KEY}
+X-Api-Resource-Id: seed-tts-2.0
+```
+
+Do not use `X-Api-App-Id` and `X-Api-Access-Key` with a new-console API Key. If the API returns `load grant: requested grant not found`, the key type or auth header is probably wrong.
+
+For long-form video narration, prefer the async submit/query endpoints:
+
+```text
+POST https://openspeech.bytedance.com/api/v3/tts/submit
+POST https://openspeech.bytedance.com/api/v3/tts/query
+```
+
+A completed query response returns `audio_url` plus `sentences[].words[]` timing metadata that can be used to build subtitles.
+
+## OpenMontage Usage
+
+Generate with the TTS selector:
+
+```python
+from tools.audio.tts_selector import TTSSelector
+
+result = TTSSelector().execute({
+    "preferred_provider": "doubao",
+    "text": "如果 AI 真的会改变未来，普通人到底该怎么参与？",
+    "voice_id": "zh_female_vv_uranus_bigtts",
+    "output_path": "projects/my-video/assets/audio/narration.mp3",
+    "speech_rate": 0,
+    "enable_timestamp": True,
+})
+```
+
+Or call the provider directly:
+
+```python
+from tools.audio.doubao_tts import DoubaoTTS
+
+result = DoubaoTTS().execute({
+    "text": "短样本试听文本。",
+    "voice_id": "zh_female_vv_uranus_bigtts",
+    "output_path": "projects/my-video/assets/audio/doubao_sample.mp3",
+})
+```
+
+The provider writes:
+
+- `output_path`: downloaded audio file
+- `metadata_path`: full query response JSON, defaulting to `<output_path>.json`
+
+## Recommended Workflow
+
+1. Generate a 10-15 second sample before a full paid narration.
+2. Ask the user to approve voice naturalness, accent, and speed.
+3. Generate the full narration only after approval.
+4. Keep the query JSON. It is the source of truth for subtitle timing.
+5. Build captions from `sentences[].words[]`, not from estimated text length.
+6. Group captions by Chinese semantic phrases before applying timestamps. Do not split only by fixed character count; it can break phrases like "在不押单个公司的情况下" or "可能会被慢慢稀释" and hurt comprehension.
+7. Let the video duration follow the approved voice rhythm unless the user explicitly asks to match a prior runtime.
+
+## Parameters
+
+- `voice_id`: Doubao `speaker` / voice type. Defaults to `DOUBAO_SPEECH_VOICE_TYPE`.
+- `resource_id`: use `seed-tts-2.0` for Doubao Speech 2.0 voices.
+- `speech_rate`: `0` is normal, `100` is 2x, `-50` is 0.5x.
+- `sample_rate`: default `24000`.
+- `enable_timestamp`: default `true`.
+- `return_usage`: default `true`, requests usage metadata when available.
+
+Do not pass `additions.explicit_language` by default. Some endpoint/key combinations reject `zh-cn` with `unsupported additions explicit language zh-cn`.
+
+For calm Mandarin explainers, start with `speech_rate: 0`. If the result is too long for the approved format, make a short comparison sample with `speech_rate: 25` or `50` before regenerating the full narration. Do not speed up only to match a previous provider's duration if the user prefers Doubao's natural pace.
+
+## Troubleshooting
+
+- `load grant: requested grant not found`: wrong key type or wrong auth header. Use `X-Api-Key` for new-console API Keys.
+- `speaker permission denied`: voice id is wrong or not authorized for the selected resource.
+- `quota exceeded`: quota, lifetime characters, or concurrency exceeded.
+- Missing timestamps: verify `enable_timestamp: true`, keep the query JSON, and confirm the selected endpoint returned `sentences`.
+
+## Safety
+
+Never print or write the API key to logs, metadata, patches, or project artifacts. `.env.example` should list variable names with empty values only, never a real key.
diff --git a/.env.example b/.env.example
index 884c01cb..cb32576a 100644
--- a/.env.example
+++ b/.env.example
@@ -13,6 +13,8 @@ GOOGLE_API_KEY=              # Google Imagen images, Google Cloud TTS (700+ voic
 ELEVENLABS_API_KEY=          # TTS narration, music generation, sound effects
 OPENAI_API_KEY=              # OpenAI TTS fallback and DALL-E image generation
 XAI_API_KEY=                 # Grok image generation/editing and Grok video generation
+DOUBAO_SPEECH_API_KEY=       # Volcengine Doubao Speech TTS (new console API Key)
+DOUBAO_SPEECH_VOICE_TYPE=    # Default Doubao speaker/voice type, e.g. zh_female_vv_uranus_bigtts
 # Piper local voices do not require env vars; install `piper-tts` via pip
 
 # --- Music ---
diff --git a/docs/PROVIDERS.md b/docs/PROVIDERS.md
index 5d84ed47..f0460a69 100644
--- a/docs/PROVIDERS.md
+++ b/docs/PROVIDERS.md
@@ -39,6 +39,8 @@ GOOGLE_API_KEY=              # Google TTS + Google Imagen
 ELEVENLABS_API_KEY=          # TTS, music, sound effects (10K chars/month free)
 OPENAI_API_KEY=              # OpenAI TTS + DALL-E 3 images
 XAI_API_KEY=                 # xAI Grok image generation/editing + Grok video generation
+DOUBAO_SPEECH_API_KEY=       # Volcengine Doubao Speech TTS (strong Mandarin narration)
+DOUBAO_SPEECH_VOICE_TYPE=    # Default Doubao speaker/voice type
 
 # MULTI-MODEL GATEWAY (one key, 6+ tools)
 FAL_KEY=                     # FLUX, Recraft, Kling, Veo, MiniMax video
@@ -159,6 +161,52 @@ No subscription — pure pay-as-you-go, no minimum spend.
 
 ---
 
+### Doubao Speech — Mandarin TTS
+
+> **Strong Mandarin narration.** Volcengine Doubao Speech is a good choice for Chinese explainer voiceovers and long-form narration that needs subtitle timing metadata.
+
+**Tools unlocked:** `doubao_tts`
+**Env vars:** `DOUBAO_SPEECH_API_KEY`, `DOUBAO_SPEECH_VOICE_TYPE`
+
+#### Setup
+
+1. Open the Volcengine Doubao Speech console and enable Speech Synthesis 2.0.
+2. Create a new-console API Key.
+3. Choose a Speech 2.0 voice type, for example `zh_female_vv_uranus_bigtts`.
+4. Add to `.env`:
+   ```bash
+   DOUBAO_SPEECH_API_KEY=your-api-key
+   DOUBAO_SPEECH_VOICE_TYPE=zh_female_vv_uranus_bigtts
+   ```
+
+#### API Notes
+
+OpenMontage uses the new-console API key flow:
+
+```text
+X-Api-Key: ${DOUBAO_SPEECH_API_KEY}
+X-Api-Resource-Id: seed-tts-2.0
+```
+
+Do not pass a new-console API Key as `X-Api-App-Id` or `X-Api-Access-Key`. That mismatch can produce `load grant: requested grant not found`.
+
+#### What It Is Best For
+
+- Natural Mandarin narration for Chinese-language explainers
+- Async long-form narration via `/api/v3/tts/submit` and `/api/v3/tts/query`
+- Character-level timing metadata for subtitle alignment
+- Calm educational pacing where the video duration can follow the approved voice rhythm
+
+#### Pacing
+
+Start with `speech_rate: 0` for natural Mandarin delivery. If the approved format needs a tighter runtime, compare short samples at `speech_rate: 25` or `50` before generating the full narration. Do not force Doubao to match another provider's duration unless the user explicitly wants that tradeoff.
+
+#### Pricing
+
+Doubao Speech 2.0 is billed by character package or usage in Volcengine. OpenMontage estimates cost from text length and prefers provider-returned usage metadata when available.
+
+---
+
 ### Google — TTS + Imagen (Shared Key)
 
 > **One key, two tools.** Google Cloud TTS has 700+ voices in 50+ languages — the strongest localization option. Imagen 4 generates high-quality images.
diff --git a/tools/audio/doubao_tts.py b/tools/audio/doubao_tts.py
new file mode 100644
index 00000000..331aff38
--- /dev/null
+++ b/tools/audio/doubao_tts.py
@@ -0,0 +1,420 @@
+"""Doubao Speech text-to-speech provider tool."""
+
+from __future__ import annotations
+
+import json
+import os
+import time
+import uuid
+from pathlib import Path
+from typing import Any
+
+from tools.base_tool import (
+    BaseTool,
+    Determinism,
+    ExecutionMode,
+    ResourceProfile,
+    RetryPolicy,
+    ToolResult,
+    ToolRuntime,
+    ToolStability,
+    ToolStatus,
+    ToolTier,
+)
+
+
+class DoubaoTTS(BaseTool):
+    name = "doubao_tts"
+    version = "0.1.0"
+    tier = ToolTier.VOICE
+    capability = "tts"
+    provider = "doubao"
+    stability = ToolStability.EXPERIMENTAL
+    execution_mode = ExecutionMode.ASYNC
+    determinism = Determinism.STOCHASTIC
+    runtime = ToolRuntime.API
+
+    dependencies = []
+    install_instructions = (
+        "Set DOUBAO_SPEECH_API_KEY to a Volcengine Doubao Speech API Key.\n"
+        "Optional: set DOUBAO_SPEECH_VOICE_TYPE to the default speaker voice.\n"
+        "Use the new console API key flow; do not pass app id/access token as the API key."
+    )
+    fallback = "google_tts"
+    fallback_tools = ["google_tts", "elevenlabs_tts", "openai_tts", "piper_tts"]
+    agent_skills = ["doubao-tts", "text-to-speech"]
+
+    capabilities = [
+        "text_to_speech",
+        "voice_selection",
+        "multilingual",
+        "timestamp_alignment",
+    ]
+    supports = {
+        "voice_cloning": False,
+        "multilingual": True,
+        "offline": False,
+        "native_audio": True,
+        "timestamps": True,
+        "long_text_async": True,
+    }
+    best_for = [
+        "natural Mandarin narration",
+        "Chinese explainer voiceovers with character-level timestamps",
+        "long-form narration that needs subtitle alignment",
+    ]
+    not_good_for = [
+        "fully offline production",
+        "voice clone matching",
+        "real-time interactive speech playback",
+    ]
+
+    input_schema = {
+        "type": "object",
+        "required": ["text"],
+        "properties": {
+            "text": {"type": "string", "description": "Text to convert to speech"},
+            "voice_id": {
+                "type": "string",
+                "description": "Doubao speaker/voice_type. Defaults to DOUBAO_SPEECH_VOICE_TYPE.",
+            },
+            "resource_id": {
+                "type": "string",
+                "default": "seed-tts-2.0",
+                "description": "Volcengine resource id. Use seed-tts-2.0 for Doubao Speech 2.0 voices.",
+            },
+            "format": {
+                "type": "string",
+                "default": "mp3",
+                "enum": ["mp3", "ogg_opus", "pcm"],
+            },
+            "sample_rate": {
+                "type": "integer",
+                "default": 24000,
+                "enum": [8000, 16000, 22050, 24000, 32000, 44100, 48000],
+            },
+            "speech_rate": {
+                "type": "integer",
+                "default": 0,
+                "minimum": -50,
+                "maximum": 100,
+                "description": "Doubao speech rate. 0=normal, 100=2x, -50=0.5x.",
+            },
+            "enable_timestamp": {
+                "type": "boolean",
+                "default": True,
+                "description": "Return sentence/word timing metadata when supported by the selected endpoint.",
+            },
+            "disable_markdown_filter": {
+                "type": "boolean",
+                "default": False,
+                "description": "Pass through Doubao markdown filtering behavior. Defaults to API-safe false.",
+            },
+            "return_usage": {
+                "type": "boolean",
+                "default": True,
+                "description": "Request usage token data from Volcengine when available.",
+            },
+            "output_path": {"type": "string"},
+            "metadata_path": {
+                "type": "string",
+                "description": "Where to save the full query JSON. Defaults next to output_path.",
+            },
+            "poll_interval_seconds": {
+                "type": "number",
+                "default": 2.0,
+                "minimum": 0.5,
+            },
+            "timeout_seconds": {
+                "type": "integer",
+                "default": 300,
+                "minimum": 30,
+            },
+        },
+    }
+
+    output_schema = {
+        "type": "object",
+        "properties": {
+            "output": {"type": "string"},
+            "metadata_path": {"type": "string"},
+            "task_id": {"type": "string"},
+            "audio_duration_seconds": {"type": ["number", "null"]},
+            "sentences": {"type": "array"},
+            "usage": {"type": ["object", "null"]},
+        },
+    }
+    artifact_schema = {
+        "type": "array",
+        "items": {"type": "string"},
+    }
+
+    resource_profile = ResourceProfile(
+        cpu_cores=1, ram_mb=256, vram_mb=0, disk_mb=50, network_required=True
+    )
+    retry_policy = RetryPolicy(
+        max_retries=2,
+        backoff_seconds=2.0,
+        retryable_errors=["timeout", "rate_limit", "quota exceeded for types: concurrency"],
+    )
+    idempotency_key_fields = ["text", "voice_id", "resource_id", "speech_rate", "sample_rate"]
+    side_effects = [
+        "writes audio file to output_path",
+        "writes Doubao query metadata JSON next to output_path",
+        "calls Volcengine Doubao Speech API",
+    ]
+    user_visible_verification = [
+        "Listen to generated audio for Mandarin naturalness and pacing",
+        "Check timestamp JSON before building subtitles",
+    ]
+    quality_score = 0.88
+    latency_p50_seconds = 8.0
+
+    SUBMIT_URL = "https://openspeech.bytedance.com/api/v3/tts/submit"
+    QUERY_URL = "https://openspeech.bytedance.com/api/v3/tts/query"
+    DEFAULT_RESOURCE_ID = "seed-tts-2.0"
+    DEFAULT_VOICE_ENV = "DOUBAO_SPEECH_VOICE_TYPE"
+
+    def get_status(self) -> ToolStatus:
+        """Return AVAILABLE when DOUBAO_SPEECH_API_KEY is set to a non-empty value, otherwise UNAVAILABLE."""
+        has_key = bool(os.environ.get("DOUBAO_SPEECH_API_KEY"))
+        return ToolStatus.AVAILABLE if has_key else ToolStatus.UNAVAILABLE
+
+    def estimate_cost(self, inputs: dict[str, Any]) -> float:
+        """Estimate the cost as len(text) * 0.000015, rounded to 4 decimals (0.0 when "text" is absent)."""
+        # Volcengine bills Doubao Speech 2.0 by characters; provider-returned usage is preferred when available.
+        return round(0.000015 * len(inputs.get("text", "")), 4)
+
+    def execute(self, inputs: dict[str, Any]) -> ToolResult:
+        """Check the API key, voice and text, then synthesize; problems come back as ToolResult(success=False)."""
+        api_key = os.environ.get("DOUBAO_SPEECH_API_KEY")
+        if not api_key:
+            return ToolResult(success=False, error="No Doubao Speech API key. " + self.install_instructions)
+
+        voice_id = inputs.get("voice_id") or os.environ.get(self.DEFAULT_VOICE_ENV)
+        if not voice_id:
+            hint = f"Pass voice_id or set {self.DEFAULT_VOICE_ENV} in the environment."
+            return ToolResult(success=False, error=f"No Doubao voice_id provided. {hint}")
+
+        text = inputs.get("text")  # checked here because a missing key would surface as "Doubao TTS failed: 'text'"
+        if not isinstance(text, str) or not text.strip():
+            return ToolResult(success=False, error="No text provided for Doubao TTS. Pass a non-empty 'text' string.")
+
+        start = time.time()
+        try:
+            result = self._generate(inputs, api_key=api_key, voice_id=voice_id)
+        except Exception as exc:  # tool boundary: report any provider/network/filesystem failure as a result
+            return ToolResult(success=False, error=f"Doubao TTS failed: {self._safe_error(exc)}")
+
+        result.duration_seconds = round(time.time() - start, 2)
+        if not result.cost_usd:  # keep a cost on the result even if _generate produced none
+            result.cost_usd = self.estimate_cost(inputs)
+        return result
+
+    def _generate(self, inputs: dict[str, Any], *, api_key: str, voice_id: str) -> ToolResult:
+        """Run one submit -> poll -> download cycle and return a successful ToolResult.
+
+        Writes the audio to output_path and the query JSON to metadata_path; raises on any HTTP, API, timeout or I/O error.
+        """
+        import requests
+
+        text = inputs["text"]
+        fmt = inputs.get("format", "mp3")
+        resource_id = inputs.get("resource_id", self.DEFAULT_RESOURCE_ID)
+        return_usage = bool(inputs.get("return_usage", True))
+        # `or` instead of a .get() default: an explicit output_path=None/"" must also fall back to the default name.
+        output_path = Path(inputs.get("output_path") or f"doubao_tts.{self._extension_for_format(fmt)}")
+        metadata_path = Path(inputs.get("metadata_path") or output_path.with_suffix(output_path.suffix + ".json"))
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        metadata_path.parent.mkdir(parents=True, exist_ok=True)
+
+        req_id = str(uuid.uuid4())
+        headers = self._headers(api_key=api_key, resource_id=resource_id, request_id=req_id, return_usage=return_usage)
+        body = self._submit_body(inputs, voice_id=voice_id, request_id=req_id)
+
+        submit_response = requests.post(self.SUBMIT_URL, headers=headers, json=body, timeout=(10, 60))
+        submit_data = self._json_or_raise(submit_response)
+        self._raise_for_doubao_error(submit_response.status_code, submit_data)
+
+        # `or {}` tolerates an explicit "data": null, which .get("data", {}) would pass through as None.
+        task_id = (submit_data.get("data") or {}).get("task_id")
+        if not task_id:
+            raise RuntimeError("Doubao submit succeeded but did not return data.task_id")
+
+        query_data = self._poll_query(
+            requests_module=requests,
+            api_key=api_key,
+            resource_id=resource_id,
+            task_id=task_id,
+            return_usage=return_usage,
+            poll_interval=float(inputs.get("poll_interval_seconds", 2.0)),
+            timeout_seconds=int(inputs.get("timeout_seconds", 300)),
+        )
+        data = query_data.get("data") or {}
+        audio_url = data.get("audio_url")
+        if not audio_url:
+            raise RuntimeError("Doubao task completed but did not return data.audio_url")
+
+        audio_response = requests.get(audio_url, timeout=(10, 120))
+        audio_response.raise_for_status()
+        output_path.write_bytes(audio_response.content)
+        metadata_path.write_text(json.dumps(query_data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+
+        audio_duration = self._audio_duration(output_path)
+        usage = data.get("usage")
+        cost = self._cost_from_usage(usage) or self.estimate_cost(inputs)
+
+        return ToolResult(
+            success=True,
+            data={
+                "provider": self.provider,
+                "model": resource_id,
+                "resource_id": resource_id,
+                "voice_id": voice_id,
+                "format": fmt,
+                "sample_rate": inputs.get("sample_rate", 24000),
+                "speech_rate": inputs.get("speech_rate", 0),
+                "text_length": len(text),
+                "task_id": task_id,
+                "task_status": data.get("task_status"),
+                "req_text_length": data.get("req_text_length"),
+                "synthesize_text_length": data.get("synthesize_text_length"),
+                "audio_duration_seconds": round(audio_duration, 2) if audio_duration else None,
+                "output": str(output_path),
+                "metadata_path": str(metadata_path),
+                "sentences": data.get("sentences", []),
+                "usage": usage,
+                "url_expire_time": data.get("url_expire_time"),
+            },
+            artifacts=[str(output_path), str(metadata_path)],
+            cost_usd=cost,
+            model=resource_id,
+        )
+
+    def _headers(self, *, api_key: str, resource_id: str, request_id: str, return_usage: bool) -> dict[str, str]:
+        """Build the HTTP headers sent with a Doubao API request.
+
+        Args:
+            api_key: Sent as ``X-Api-Key``.
+            resource_id: Sent as ``X-Api-Resource-Id``.
+            request_id: Sent as ``X-Api-Request-Id``.
+            return_usage: When true, also sends ``X-Control-Require-Usage-Tokens-Return: true``.
+        """
+        usage_flag = {"X-Control-Require-Usage-Tokens-Return": "true"} if return_usage else {}
+        return {
+            "X-Api-Key": api_key,
+            "X-Api-Resource-Id": resource_id,
+            "X-Api-Request-Id": request_id,
+            "Content-Type": "application/json",
+            **usage_flag,
+        }
+
+    def _submit_body(self, inputs: dict[str, Any], *, voice_id: str, request_id: str) -> dict[str, Any]:
+        """Assemble the JSON payload for the submit call.
+
+        ``additions`` is embedded as a JSON-encoded string, and ``request_id`` is sent as ``unique_id``.
+        Audio options missing from ``inputs`` default to mp3, sample_rate 24000, speech_rate 0, timestamps on.
+        """
+        additions = {"disable_markdown_filter": bool(inputs.get("disable_markdown_filter", False))}
+        req_params = {
+            "text": inputs["text"],
+            "speaker": voice_id,
+            "audio_params": {
+                "format": inputs.get("format", "mp3"),
+                "sample_rate": inputs.get("sample_rate", 24000),
+                "speech_rate": inputs.get("speech_rate", 0),
+                "enable_timestamp": bool(inputs.get("enable_timestamp", True)),
+            },
+            "additions": json.dumps(additions, ensure_ascii=False),
+        }
+        uid = inputs.get("user_id", "openmontage")
+        return {"user": {"uid": uid}, "unique_id": request_id, "req_params": req_params}
+
+    def _poll_query(
+        self,
+        *,
+        requests_module: Any,
+        api_key: str,
+        resource_id: str,
+        task_id: str,
+        return_usage: bool,
+        poll_interval: float,
+        timeout_seconds: int,
+    ) -> dict[str, Any]:
+        """Poll the query endpoint until the task finishes and return the final response payload.
+
+        Done is task_status 2; task_status 3 or an API error raises RuntimeError, a timeout raises TimeoutError.
+        """
+        deadline = time.time() + timeout_seconds
+        while time.time() < deadline:
+            time.sleep(poll_interval)  # sleeps before every query, including the first
+            request_id = str(uuid.uuid4())
+            headers = self._headers(api_key=api_key, resource_id=resource_id, request_id=request_id, return_usage=return_usage)
+            response = requests_module.post(self.QUERY_URL, headers=headers, json={"task_id": task_id}, timeout=(10, 60))
+            query_data = self._json_or_raise(response)
+            self._raise_for_doubao_error(response.status_code, query_data)
+            status = (query_data.get("data") or {}).get("task_status")  # `or {}` tolerates "data": null
+            if status == 2:
+                return query_data
+            if status == 3:
+                raise RuntimeError(f"Doubao task failed: {query_data.get('message', 'unknown error')}")
+        raise TimeoutError(f"Doubao task did not finish within {timeout_seconds} seconds")
+
+    @staticmethod
+    def _json_or_raise(response: Any) -> dict[str, Any]:  # decoded JSON body, or RuntimeError if it is not JSON
+        try:
+            return response.json()
+        except ValueError as exc:  # the error names only the HTTP status, not the response body
+            raise RuntimeError(f"Non-JSON response from Doubao API: HTTP {response.status_code}") from exc
+
+    def _raise_for_doubao_error(self, http_status: int, payload: dict[str, Any]) -> None:
+        """Raise RuntimeError unless the HTTP status is below 400 and the payload code is 20000000."""
+        code = payload.get("code")
+        if http_status < 400 and code == 20000000:
+            return
+        message = str(payload.get("message") or "unknown error")  # a null/non-string message must not break .lower()
+        raise RuntimeError(f"HTTP {http_status}, code {code}: {message}{self._diagnostic_hint(message)}")
+
+    @staticmethod
+    def _diagnostic_hint(message: str) -> str:
+        """Return a parenthesized troubleshooting hint for known Doubao error texts, or an empty string."""
+        # Rules are checked in order against the lower-cased message; the first match wins.
+        hints = (
+            (("load grant", "requested grant not found"), " (check DOUBAO_SPEECH_API_KEY and use the new-console X-Api-Key flow)"),
+            (("speaker permission denied", "access denied"), " (check voice_id/DOUBAO_SPEECH_VOICE_TYPE and voice authorization)"),
+            (("quota exceeded",), " (check quota, concurrency, or remaining character package)"),
+            (("unsupported additions explicit language",), " (do not pass additions.explicit_language for this endpoint)"),
+        )
+        text = message.lower()
+        return next((hint for needles, hint in hints if any(needle in text for needle in needles)), "")
+
+    @staticmethod
+    def _safe_error(exc: Exception) -> str:
+        key = os.environ.get("DOUBAO_SPEECH_API_KEY")  # redact only a non-empty key: replace("", ...) pads every char
+        return str(exc).replace(key, "[redacted]") if key else str(exc)
+
+    @staticmethod
+    def _extension_for_format(fmt: str) -> str:
+        """Map an audio format name to a file extension.
+
+        "ogg_opus" maps to "ogg" and "pcm" to "pcm"; every other value maps to "mp3".
+        """
+        return "ogg" if fmt == "ogg_opus" else "pcm" if fmt == "pcm" else "mp3"
+
+    @staticmethod
+    def _audio_duration(path: Path) -> float | None:  # best-effort: None when the probe import or call fails
+        try:
+            from tools.analysis.audio_probe import probe_duration
+            # Imported inside the try so an ImportError is swallowed like any other probe failure.
+            return probe_duration(path)
+        except Exception:  # deliberate catch-all: a missing duration must not fail the synthesis
+            return None
+
+    @staticmethod
+    def _cost_from_usage(usage: Any) -> float | None:
+        """Price provider-reported usage: numeric ``text_words`` times 0.000015 (4 decimals), else None."""
+        # None is returned for a non-dict usage or a missing/non-numeric text_words.
+        words = usage.get("text_words") if isinstance(usage, dict) else None
+        if isinstance(words, (int, float)):
+            return round(float(words) * 0.000015, 4)
+        return None