diff --git a/packages/cli/src/media-config.ts b/packages/cli/src/media-config.ts index eccb0a8..c723f53 100644 --- a/packages/cli/src/media-config.ts +++ b/packages/cli/src/media-config.ts @@ -14,10 +14,21 @@ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'node:fs'; import { join } from 'node:path'; -import { resolveMinimaxCredentials, type MinimaxCredentials } from '@html-video/core'; +import { + resolveMinimaxCredentials, + type MinimaxCredentials, + resolveFishAudioCredentials, + type FishAudioCredentials, +} from '@html-video/core'; + +/** Which provider synthesizes narration. Music is always MiniMax (FishAudio + * has no music generation). Defaults to 'minimax' for backward compat. */ +export type NarrationProvider = 'minimax' | 'fishaudio'; interface MediaConfig { minimax?: { apiKey?: string; baseUrl?: string }; + fishaudio?: { apiKey?: string; baseUrl?: string }; + narrationProvider?: NarrationProvider; } export class MediaConfigStore { @@ -86,6 +97,67 @@ export class MediaConfigStore { } return resolveMinimaxCredentials(); } + + // --- FishAudio (narration only; no region — single global host) ---------- + + /** What the Settings UI shows for FishAudio: configured? + masked key + base + * URL. Never returns the raw key. Reports the source (config file vs env). */ + getFishAudioStatus(): { configured: boolean; source: 'config' | 'env' | 'none'; maskedKey: string; baseUrl: string } { + const cfg = this.read().fishaudio; + if (cfg?.apiKey) { + return { configured: true, source: 'config', maskedKey: mask(cfg.apiKey), baseUrl: cfg.baseUrl ?? '' }; + } + const env = resolveFishAudioCredentials(); + if (env) { + return { configured: true, source: 'env', maskedKey: mask(env.apiKey), baseUrl: env.baseUrl }; + } + return { configured: false, source: 'none', maskedKey: '', baseUrl: '' }; + } + + /** Persist a FishAudio key (and optional base URL) entered in the UI. 
*/
+  setFishAudio(apiKey: string, baseUrl?: string): void {
+    const cfg = this.read();
+    cfg.fishaudio = { apiKey: apiKey.trim() };
+    const b = (baseUrl ?? '').trim();
+    if (b) cfg.fishaudio.baseUrl = b;
+    this.write(cfg);
+  }
+
+  /** Forget the stored FishAudio key (env fallback, if any, still applies). */
+  clearFishAudio(): void {
+    const cfg = this.read();
+    delete cfg.fishaudio;
+    this.write(cfg);
+  }
+
+  /** Resolve usable FishAudio creds: config file first, then env. The model is
+   * always env-controlled (FISH_AUDIO_MODEL); we reuse the core resolver so
+   * the model + base-URL defaults stay in one place.
+   *
+   * A stored key that is empty after trimming (e.g. a hand-edited config file)
+   * is treated as "not configured" and falls through to the env lookup rather
+   * than crashing on a null resolver result.
+   *
+   * @returns the credentials to use, or `null` when neither the config file
+   *   nor the environment provides a key. */
+  resolveFishAudio(): FishAudioCredentials | null {
+    const cfg = this.read().fishaudio;
+    // Trim before testing: the core resolver trims the key and returns null for
+    // a whitespace-only value, so an untrimmed truthiness check is not enough.
+    const storedKey = (cfg?.apiKey ?? '').trim();
+    if (storedKey) {
+      // Inject the stored key into the env resolver so model + base defaults
+      // are computed identically, then let a config baseUrl win if present.
+      const ref = resolveFishAudioCredentials({ ...process.env, FISH_AUDIO_API_KEY: storedKey });
+      if (ref) {
+        const baseUrl = (cfg?.baseUrl ?? '').trim().replace(/\/$/, '') || ref.baseUrl;
+        // Return the trimmed key so stray whitespace never reaches the
+        // Authorization header.
+        return { apiKey: storedKey, baseUrl, model: ref.model };
+      }
+    }
+    return resolveFishAudioCredentials();
+  }
+
+  // --- Active narration provider -------------------------------------------
+
+  /** Which provider synthesizes narration. Defaults to 'minimax'. */
+  getNarrationProvider(): NarrationProvider {
+    return this.read().narrationProvider === 'fishaudio' ? 'fishaudio' : 'minimax';
+  }
+
+  /** Persist the active narration provider. */
+  setNarrationProvider(provider: NarrationProvider): void {
+    const cfg = this.read();
+    cfg.narrationProvider = provider === 'fishaudio' ?
'fishaudio' : 'minimax'; + this.write(cfg); + } } function mask(key: string): string { diff --git a/packages/cli/src/studio-server.ts b/packages/cli/src/studio-server.ts index 6888cba..0ce3e34 100644 --- a/packages/cli/src/studio-server.ts +++ b/packages/cli/src/studio-server.ts @@ -11,7 +11,15 @@ import { randomUUID } from 'node:crypto'; import { fileURLToPath } from 'node:url'; import { tmpdir } from 'node:os'; import type { CliContext } from './context.js'; -import { AssetStore, generateTts, generateMusic } from '@html-video/core'; +import { + AssetStore, + generateTts, + generateMusic, + generateFishTts, + listFishVoices, + type MinimaxCredentials, + type FishAudioCredentials, +} from '@html-video/core'; import { extractUrls, fetchSource } from './fetch-source.js'; import { detectAll, findAgent, spawnAgent } from '@html-video/runtime'; @@ -433,23 +441,42 @@ export async function startStudioServer(ctx: CliContext, port: number): Promise< }; try { sse({ type: 'audio_started' }); - const creds = ctx.mediaConfig.resolveMinimax(); - if (!creds) { + const project = await ctx.orchestrator.load(projectId); + const soundtrack = { ...(project.soundtrack ?? {}) }; + const wantMusic = !!body.music?.prompt?.trim(); + const wantNarration = !!body.narration?.text?.trim(); + if (!wantMusic && !wantNarration) { + sse({ type: 'audio_failed', message: 'Nothing to generate — provide a music prompt and/or narration text.' }); + res.end(); + return; + } + + // Music is always MiniMax (FishAudio has no music generation). + const musicCreds = wantMusic ? 
ctx.mediaConfig.resolveMinimax() : null; + if (wantMusic && !musicCreds) { sse({ type: 'audio_failed', - message: - 'MiniMax API key not configured — add it in Settings → Audio (or set OD_MINIMAX_API_KEY).', + message: 'MiniMax API key not configured — add it in Settings → Audio (or set OD_MINIMAX_API_KEY).', }); res.end(); return; } - const project = await ctx.orchestrator.load(projectId); - const soundtrack = { ...(project.soundtrack ?? {}) }; - const wantMusic = !!body.music?.prompt?.trim(); - const wantNarration = !!body.narration?.text?.trim(); - if (!wantMusic && !wantNarration) { - sse({ type: 'audio_failed', message: 'Nothing to generate — provide a music prompt and/or narration text.' }); + // Narration provider is user-selectable (MiniMax or FishAudio). + const narrationProvider = ctx.mediaConfig.getNarrationProvider(); + const narrationCreds = wantNarration + ? narrationProvider === 'fishaudio' + ? ctx.mediaConfig.resolveFishAudio() + : ctx.mediaConfig.resolveMinimax() + : null; + if (wantNarration && !narrationCreds) { + sse({ + type: 'audio_failed', + message: + narrationProvider === 'fishaudio' + ? 'FishAudio API key not configured — add it in Settings → Audio (or set FISH_AUDIO_API_KEY).' + : 'MiniMax API key not configured — add it in Settings → Audio (or set OD_MINIMAX_API_KEY).', + }); res.end(); return; } @@ -459,7 +486,7 @@ export async function startStudioServer(ctx: CliContext, port: number): Promise< const music = await generateMusic({ prompt: body.music!.prompt!.trim(), instrumental: body.music!.instrumental ?? 
true, - creds, + creds: musicCreds!, }); const { asset } = await ctx.orchestrator.addBufferAsset( projectId, @@ -474,13 +501,20 @@ export async function startStudioServer(ctx: CliContext, port: number): Promise< } if (wantNarration) { - sse({ type: 'audio_progress', stage: 'narration', message: 'generating narration…' }); - const nar = await generateTts({ - text: body.narration!.text!.trim(), - ...(body.narration!.voiceId !== undefined && { voiceId: body.narration!.voiceId }), - ...(body.narration!.languageBoost !== undefined && { languageBoost: body.narration!.languageBoost }), - creds, - }); + sse({ type: 'audio_progress', stage: 'narration', message: `generating narration (${narrationProvider})…` }); + const nar = + narrationProvider === 'fishaudio' + ? await generateFishTts({ + text: body.narration!.text!.trim(), + ...(body.narration!.voiceId ? { referenceId: body.narration!.voiceId } : {}), + creds: narrationCreds as FishAudioCredentials, + }) + : await generateTts({ + text: body.narration!.text!.trim(), + ...(body.narration!.voiceId !== undefined && { voiceId: body.narration!.voiceId }), + ...(body.narration!.languageBoost !== undefined && { languageBoost: body.narration!.languageBoost }), + creds: narrationCreds as MinimaxCredentials, + }); const { asset } = await ctx.orchestrator.addBufferAsset( projectId, nar.bytes, @@ -632,6 +666,47 @@ export async function startStudioServer(ctx: CliContext, port: number): Promise< return json(res, 200, ctx.mediaConfig.getMinimaxStatus()); } + // FishAudio audio API config — mirrors MiniMax (no region; single host). + if (url.pathname === '/api/config/fishaudio' && m === 'GET') { + return json(res, 200, ctx.mediaConfig.getFishAudioStatus()); + } + if (url.pathname === '/api/config/fishaudio' && m === 'POST') { + const body = (await readBody(req)) as { apiKey?: string; baseUrl?: string }; + const key = (body.apiKey ?? 
'').trim(); + if (!key) return json(res, 400, { error: 'apiKey is required' }); + ctx.mediaConfig.setFishAudio(key, body.baseUrl); + return json(res, 200, ctx.mediaConfig.getFishAudioStatus()); + } + if (url.pathname === '/api/config/fishaudio' && m === 'DELETE') { + ctx.mediaConfig.clearFishAudio(); + return json(res, 200, ctx.mediaConfig.getFishAudioStatus()); + } + + // Active narration provider (which backend synthesizes voiceover). + if (url.pathname === '/api/config/narration-provider' && m === 'GET') { + return json(res, 200, { provider: ctx.mediaConfig.getNarrationProvider() }); + } + if (url.pathname === '/api/config/narration-provider' && m === 'POST') { + const body = (await readBody(req)) as { provider?: string }; + const provider = body.provider === 'fishaudio' ? 'fishaudio' : 'minimax'; + ctx.mediaConfig.setNarrationProvider(provider); + return json(res, 200, { provider: ctx.mediaConfig.getNarrationProvider() }); + } + + // FishAudio voice search — proxies the account's own models server-side so + // the browser never sees the key. Returns a trimmed list for the picker. + if (url.pathname === '/api/fishaudio/voices' && m === 'GET') { + const creds = ctx.mediaConfig.resolveFishAudio(); + if (!creds) return json(res, 400, { error: 'FishAudio API key not configured' }); + try { + const voices = await listFishVoices({ creds, query: url.searchParams.get('q') ?? '' }); + return json(res, 200, { voices }); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + return json(res, 502, { error: msg }); + } + } + // Agents (detected on each call; cheap thanks to the in-process cache) if (url.pathname === '/api/agents' && m === 'GET') { const force = url.searchParams.get('force') === '1'; diff --git a/packages/core/package.json b/packages/core/package.json index 5bac310..46a3d60 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -20,7 +20,7 @@ "scripts": { "build": "tsc -p tsconfig.json", "typecheck": "tsc -p tsconfig.json --noEmit", - "test": "node --test test/" + "test": "npm run build && node --test --experimental-strip-types \"test/**/*.test.ts\"" }, "dependencies": { "@html-video/content-graph": "workspace:*", diff --git a/packages/core/src/fishaudio.ts b/packages/core/src/fishaudio.ts new file mode 100644 index 0000000..c381ab2 --- /dev/null +++ b/packages/core/src/fishaudio.ts @@ -0,0 +1,213 @@ +/** + * @html-video/core — FishAudio TTS provider (narration only). + * + * FishAudio (https://fish.audio) is a second narration backend alongside + * {@link ./minimax.ts}. Unlike MiniMax it does NOT generate music, so this + * module only exposes speech synthesis + a voice-listing helper. + * + * Shape differences from MiniMax that this module absorbs: + * - the model is selected via an HTTP `model` header (s1 / s2-pro), not a + * body field, and is configured through the environment (FISH_AUDIO_MODEL); + * - `/v1/tts` returns the audio as a RAW binary body — no JSON envelope, no + * hex string to decode (MiniMax wraps it in `base_resp` + hex); + * - errors are surfaced as ordinary HTTP status codes (401/402/422); + * - a single global host (no international/China region split). + * + * Credentials are read from the environment so the studio works without a + * config file; a missing key yields `null` from + * {@link resolveFishAudioCredentials} and callers report it gracefully. 
+ */ + +import { HtmlVideoError } from './errors.js'; +import type { TtsAudioResult } from './types/index.js'; + +/** Default host. FishAudio is a single global endpoint (no region split). We + * store the host only and append `/v1/tts` and `/model`; override via + * FISH_AUDIO_BASE_URL (or the Studio Settings UI). */ +const FISH_DEFAULT_BASE_URL = 'https://api.fish.audio'; +/** Default speech model. `s1` is the fast general model; `s2-pro` adds + * multi-speaker. Selected via the `model` header, configured by env. */ +const FISH_DEFAULT_MODEL = 's1'; +/** Hard ceiling for a single TTS request — a request that hasn't returned in + * 2 minutes is hung, not slow. */ +const FISH_REQUEST_TIMEOUT_MS = 120_000; + +export interface FishAudioCredentials { + apiKey: string; + baseUrl: string; + /** Speech model sent in the `model` header (e.g. 's1' or 's2-pro'). */ + model: string; +} + +/** + * Resolve FishAudio credentials from the environment. Returns `null` (not + * throw) when no key is set, so the studio can show a friendly "configure your + * key" message instead of a 500. + * + * Key precedence: FISH_AUDIO_API_KEY → FISHAUDIO_API_KEY + * Base precedence: FISH_AUDIO_BASE_URL → default + * Model precedence: FISH_AUDIO_MODEL → default (s1) + */ +export function resolveFishAudioCredentials( + env: NodeJS.ProcessEnv = process.env, +): FishAudioCredentials | null { + const apiKey = (env.FISH_AUDIO_API_KEY || env.FISHAUDIO_API_KEY || '').trim(); + if (!apiKey) return null; + const baseUrl = (env.FISH_AUDIO_BASE_URL || FISH_DEFAULT_BASE_URL).trim().replace(/\/$/, ''); + const model = (env.FISH_AUDIO_MODEL || FISH_DEFAULT_MODEL).trim(); + return { apiKey, baseUrl, model }; +} + +/** + * Synthesize spoken narration via FishAudio TTS (`POST /v1/tts`). + * + * The model is sent in the `model` header (from creds.model). The response is + * a RAW binary audio body — no JSON envelope — so we read `arrayBuffer()` + * directly. 
A missing `referenceId` falls back to FishAudio's default voice.
+ */
+export async function generateFishTts(opts: {
+  text: string;
+  referenceId?: string;
+  creds: FishAudioCredentials;
+  signal?: AbortSignal;
+}): Promise<TtsAudioResult> {
+  const text = (opts.text || '').trim();
+  if (!text) {
+    throw new HtmlVideoError('invalid-input', 'narration text is empty');
+  }
+  const referenceId = (opts.referenceId || '').trim();
+  const { creds } = opts;
+
+  const body = {
+    text,
+    format: 'mp3',
+    ...(referenceId ? { reference_id: referenceId } : {}),
+  };
+
+  // Always enforce the hard timeout, even when the caller supplies a signal.
+  const timeout = withRequestTimeout(opts.signal);
+
+  let resp: Response;
+  try {
+    resp = await fetch(`${creds.baseUrl}/v1/tts`, {
+      method: 'POST',
+      headers: {
+        authorization: `Bearer ${creds.apiKey}`,
+        'content-type': 'application/json',
+        // FishAudio selects the speech model via a header, not a body field.
+        model: creds.model,
+      },
+      body: JSON.stringify(body),
+      signal: timeout.signal,
+    });
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    // Tell a real timeout apart from a caller-initiated abort: both surface as
+    // abort-style errors, but only the former should blame the API.
+    const reason = timeout.timedOut()
+      ? `fishaudio tts timed out after ${Math.round(FISH_REQUEST_TIMEOUT_MS / 1000)}s (the API did not respond — try again, or check FISH_AUDIO_BASE_URL)`
+      : opts.signal?.aborted
+        ? 'fishaudio tts request was cancelled'
+        : `fishaudio tts request failed: ${msg}`;
+    throw new HtmlVideoError('render-failed', reason, true);
+  }
+
+  if (!resp.ok) {
+    const detail = truncate(await resp.text().catch(() => ''), 200);
+    const hint =
+      resp.status === 401
+        ? ' (auth — check the FishAudio API key)'
+        : resp.status === 402
+          ? ' (no credit — check the account balance)'
+          : '';
+    throw new HtmlVideoError(
+      'render-failed',
+      `fishaudio tts ${resp.status}: ${detail || 'request rejected'}${hint}`,
+      resp.status >= 500,
+    );
+  }
+
+  const bytes = Buffer.from(await resp.arrayBuffer());
+  if (bytes.length === 0) {
+    throw new HtmlVideoError('render-failed', 'fishaudio tts returned zero audio bytes');
+  }
+  return {
+    bytes,
+    ext: '.mp3',
+    providerNote: `fishaudio/${creds.model} · ${referenceId || 'default'} · ${bytes.length} bytes`,
+  };
+}
+
+/** A trimmed FishAudio voice model, for the studio's voice picker. */
+export interface FishVoice {
+  /** The model id — passed back as `reference_id` when synthesizing. */
+  id: string;
+  title: string;
+  languages: string[];
+  /** Preview audio URL (first sample), if the model has one. */
+  sampleUrl?: string;
+}
+
+/**
+ * List the account's own voice models via `GET /model?self=true`, optionally
+ * filtered by a title query. Returns a trimmed shape for the picker; the raw
+ * FishAudio model objects carry far more than the UI needs.
+ *
+ * The request is bounded by the same hard timeout as TTS so a hung upstream
+ * cannot hang the caller indefinitely. Items without a string `_id` are
+ * dropped; an unparseable JSON body yields an empty list.
+ */
+export async function listFishVoices(opts: {
+  creds: FishAudioCredentials;
+  query?: string;
+  pageSize?: number;
+  signal?: AbortSignal;
+}): Promise<FishVoice[]> {
+  const { creds } = opts;
+  const params = new URLSearchParams({ self: 'true', page_size: String(opts.pageSize ?? 20) });
+  const query = (opts.query || '').trim();
+  if (query) params.set('title', query);
+
+  const timeout = withRequestTimeout(opts.signal);
+
+  let resp: Response;
+  try {
+    resp = await fetch(`${creds.baseUrl}/model?${params.toString()}`, {
+      headers: { authorization: `Bearer ${creds.apiKey}` },
+      signal: timeout.signal,
+    });
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    const reason = timeout.timedOut()
+      ? `fishaudio list voices timed out after ${Math.round(FISH_REQUEST_TIMEOUT_MS / 1000)}s`
+      : `fishaudio list voices request failed: ${msg}`;
+    throw new HtmlVideoError('render-failed', reason, true);
+  }
+  if (!resp.ok) {
+    const detail = truncate(await resp.text().catch(() => ''), 200);
+    throw new HtmlVideoError(
+      'render-failed',
+      `fishaudio list voices ${resp.status}: ${detail || 'request rejected'}`,
+      resp.status >= 500,
+    );
+  }
+
+  const data = (await resp.json().catch(() => ({}))) as {
+    items?: Array<{
+      _id?: string;
+      title?: string;
+      languages?: string[];
+      samples?: Array<{ audio?: string }>;
+    }>;
+  };
+  return (data.items ?? [])
+    .filter((m): m is { _id: string } & typeof m => typeof m._id === 'string' && m._id.length > 0)
+    .map((m) => {
+      const sampleUrl = m.samples?.find((s) => typeof s.audio === 'string')?.audio;
+      return {
+        id: m._id,
+        title: (m.title || '').trim() || m._id,
+        languages: Array.isArray(m.languages) ? m.languages : [],
+        ...(sampleUrl ? { sampleUrl } : {}),
+      };
+    });
+}
+
+/** Cap `s` at `n` UTF-16 code units, appending an ellipsis when cut. */
+function truncate(s: string, n: number): string {
+  return s.length > n ? `${s.slice(0, n)}…` : s;
+}
+
+/**
+ * Combine an optional caller signal with the hard request timeout.
+ *
+ * Uses `AbortSignal.any` when the runtime has it; otherwise forwards both
+ * signals through a local AbortController, so the timeout is never dropped
+ * just because the caller passed a signal of their own.
+ *
+ * @returns the signal to hand to `fetch`, plus `timedOut()`, which is true only
+ *   when the timeout fired and the caller's own signal did not abort.
+ */
+function withRequestTimeout(signal: AbortSignal | undefined): {
+  signal: AbortSignal;
+  timedOut: () => boolean;
+} {
+  const timeoutSignal = AbortSignal.timeout(FISH_REQUEST_TIMEOUT_MS);
+  const timedOut = (): boolean => timeoutSignal.aborted && !signal?.aborted;
+  if (!signal) return { signal: timeoutSignal, timedOut };
+  if (typeof AbortSignal.any === 'function') {
+    return { signal: AbortSignal.any([signal, timeoutSignal]), timedOut };
+  }
+  const ctrl = new AbortController();
+  for (const s of [signal, timeoutSignal]) {
+    if (s.aborted) {
+      ctrl.abort(s.reason);
+      break;
+    }
+    s.addEventListener('abort', () => ctrl.abort(s.reason), { once: true });
+  }
+  return { signal: ctrl.signal, timedOut };
+}
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 0339d17..0e9c913 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -19,3 +19,9 @@ export {
   generateMusic,
 } from './minimax.js';
 export type { MinimaxCredentials, MinimaxAudioResult } from './minimax.js';
+export {
+  resolveFishAudioCredentials,
+  generateFishTts,
+  listFishVoices,
+} from './fishaudio.js';
+export type { FishAudioCredentials, FishVoice } from './fishaudio.js';
diff --git a/packages/core/src/minimax.ts b/packages/core/src/minimax.ts
index e5bd0dc..6711622 100644
--- a/packages/core/src/minimax.ts
+++ b/packages/core/src/minimax.ts
@@ -16,6 +16,7 @@
  */
 
 import { HtmlVideoError } from './errors.js';
+import type { TtsAudioResult } from './types/index.js';
 
 /** Default base URL. The old `api.minimaxi.chat` host is RETIRED server-side
  * (issue #4).
MiniMax now has two region-bound endpoints — international @@ -43,16 +44,9 @@ export interface MinimaxCredentials { baseUrl: string; } -export interface MinimaxAudioResult { - /** Decoded audio bytes (MP3). */ - bytes: Buffer; - /** File extension to store under. */ - ext: '.mp3'; - /** Human-readable note of what was produced (provider · model · size). */ - providerNote: string; - /** Reported duration in seconds, if the API surfaced it. */ - durationSec?: number; -} +/** @deprecated Use the shared {@link TtsAudioResult}. Kept as an alias so + * existing imports keep working. */ +export type MinimaxAudioResult = TtsAudioResult; /** * Resolve MiniMax credentials from the environment. Returns `null` (not throw) diff --git a/packages/core/src/types/index.ts b/packages/core/src/types/index.ts index 22c4a29..b5560d3 100644 --- a/packages/core/src/types/index.ts +++ b/packages/core/src/types/index.ts @@ -368,6 +368,22 @@ export interface FrameRecord { previewMp4Path?: string; } +/** + * Result of a TTS / audio synthesis call, shared across narration providers + * (MiniMax, FishAudio, …). Providers decode whatever wire format they speak + * (MiniMax: JSON+hex envelope; FishAudio: raw binary) into these common bytes. + */ +export interface TtsAudioResult { + /** Decoded audio bytes. */ + bytes: Buffer; + /** File extension to store under (e.g. '.mp3', '.wav'). */ + ext: string; + /** Human-readable note of what was produced (provider · model · size). */ + providerNote: string; + /** Reported duration in seconds, if the provider surfaced it (cosmetic). */ + durationSec?: number; +} + /** * v0.9: project-level soundtrack — one background music track + one narration * track mixed into the exported MP4. 
Both reference an entry in `assets[]` diff --git a/packages/core/test/fishaudio.test.ts b/packages/core/test/fishaudio.test.ts new file mode 100644 index 0000000..509b1f0 --- /dev/null +++ b/packages/core/test/fishaudio.test.ts @@ -0,0 +1,199 @@ +import { test } from 'node:test'; +import assert from 'node:assert/strict'; +import { resolveFishAudioCredentials, generateFishTts, listFishVoices } from '../dist/fishaudio.js'; +import { HtmlVideoError } from '../dist/errors.js'; + +const CREDS = { apiKey: 'k-secret', baseUrl: 'https://api.fish.audio', model: 's1' }; + +/** Swap in a stubbed global fetch for the duration of `fn`, recording the + * single request it receives. Restores the real fetch afterwards. */ +async function withFetch( + responder: (url: string, init: RequestInit) => Response, + fn: (calls: { url: string; init: RequestInit }[]) => Promise, +): Promise { + const calls: { url: string; init: RequestInit }[] = []; + const real = globalThis.fetch; + globalThis.fetch = (async (url: string, init: RequestInit) => { + calls.push({ url, init }); + return responder(url, init); + }) as unknown as typeof fetch; + try { + await fn(calls); + } finally { + globalThis.fetch = real; + } +} + +const okAudio = () => + new Response(new Uint8Array([0xff, 0xfb, 0x10, 0x20]), { + status: 200, + headers: { 'content-type': 'audio/mpeg' }, + }); + +// resolveFishAudioCredentials mirrors resolveMinimaxCredentials: it reads creds +// from the environment, returns null (never throws) when no key is set, and +// applies env precedence + defaults. FishAudio has a single global host (no +// region split) and selects the model via env (FISH_AUDIO_MODEL). 
+ +test('returns null when no key is set', () => { + assert.equal(resolveFishAudioCredentials({}), null); +}); + +test('FISH_AUDIO_API_KEY yields default host + default model s1', () => { + const c = resolveFishAudioCredentials({ FISH_AUDIO_API_KEY: 'k-123' }); + assert.deepEqual(c, { apiKey: 'k-123', baseUrl: 'https://api.fish.audio', model: 's1' }); +}); + +test('FISHAUDIO_API_KEY is accepted as a fallback key name', () => { + const c = resolveFishAudioCredentials({ FISHAUDIO_API_KEY: 'k-fallback' }); + assert.equal(c?.apiKey, 'k-fallback'); +}); + +test('FISH_AUDIO_API_KEY takes precedence over FISHAUDIO_API_KEY', () => { + const c = resolveFishAudioCredentials({ + FISH_AUDIO_API_KEY: 'primary', + FISHAUDIO_API_KEY: 'secondary', + }); + assert.equal(c?.apiKey, 'primary'); +}); + +test('FISH_AUDIO_BASE_URL overrides the default and strips a trailing slash', () => { + const c = resolveFishAudioCredentials({ + FISH_AUDIO_API_KEY: 'k', + FISH_AUDIO_BASE_URL: 'https://proxy.example.com/', + }); + assert.equal(c?.baseUrl, 'https://proxy.example.com'); +}); + +test('FISH_AUDIO_MODEL overrides the default model', () => { + const c = resolveFishAudioCredentials({ FISH_AUDIO_API_KEY: 'k', FISH_AUDIO_MODEL: 's2-pro' }); + assert.equal(c?.model, 's2-pro'); +}); + +test('a whitespace-only key is treated as unset', () => { + assert.equal(resolveFishAudioCredentials({ FISH_AUDIO_API_KEY: ' ' }), null); +}); + +// --- generateFishTts ------------------------------------------------------- + +test('posts to /v1/tts with bearer auth, the model header, and the right body', async () => { + await withFetch(okAudio, async (calls) => { + const r = await generateFishTts({ text: 'hello world', referenceId: 'voice-42', creds: CREDS }); + assert.equal(calls.length, 1); + assert.equal(calls[0]!.url, 'https://api.fish.audio/v1/tts'); + const h = calls[0]!.init.headers as Record; + assert.equal(h.authorization, 'Bearer k-secret'); + assert.equal(h.model, 's1'); + 
assert.equal(h['content-type'], 'application/json'); + const body = JSON.parse(calls[0]!.init.body as string); + assert.equal(body.text, 'hello world'); + assert.equal(body.reference_id, 'voice-42'); + assert.equal(body.format, 'mp3'); + // decoded the raw binary body into bytes, tagged as mp3 + assert.ok(Buffer.isBuffer(r.bytes)); + assert.equal(r.bytes.length, 4); + assert.equal(r.ext, '.mp3'); + assert.match(r.providerNote, /fishaudio\/s1/); + }); +}); + +test('omits reference_id from the body when no voice is given (default voice)', async () => { + await withFetch(okAudio, async (calls) => { + await generateFishTts({ text: 'no voice', creds: CREDS }); + const body = JSON.parse(calls[0]!.init.body as string); + assert.equal('reference_id' in body, false); + }); +}); + +test('rejects empty text without hitting the network', async () => { + await withFetch(okAudio, async (calls) => { + await assert.rejects( + () => generateFishTts({ text: ' ', creds: CREDS }), + (e: unknown) => e instanceof HtmlVideoError && e.code === 'invalid-input', + ); + assert.equal(calls.length, 0); + }); +}); + +test('maps HTTP 401 to a friendly auth error', async () => { + const resp401 = () => new Response('unauthorized', { status: 401 }); + await withFetch(resp401, async () => { + await assert.rejects( + () => generateFishTts({ text: 'x', creds: CREDS }), + (e: unknown) => e instanceof HtmlVideoError && /401|auth|key/i.test(e.message), + ); + }); +}); + +test('maps HTTP 402 to a friendly credit/balance error', async () => { + const resp402 = () => new Response('payment required', { status: 402 }); + await withFetch(resp402, async () => { + await assert.rejects( + () => generateFishTts({ text: 'x', creds: CREDS }), + (e: unknown) => e instanceof HtmlVideoError && /402|credit|balance/i.test(e.message), + ); + }); +}); + +test('rejects a zero-byte audio body', async () => { + const empty = () => + new Response(new Uint8Array([]), { status: 200, headers: { 'content-type': 'audio/mpeg' } }); 
+ await withFetch(empty, async () => { + await assert.rejects( + () => generateFishTts({ text: 'x', creds: CREDS }), + (e: unknown) => e instanceof HtmlVideoError, + ); + }); +}); + +// --- listFishVoices -------------------------------------------------------- + +const voicesBody = (items: unknown[]) => + new Response(JSON.stringify({ total: items.length, items }), { + status: 200, + headers: { 'content-type': 'application/json' }, + }); + +test('lists own voices via GET /model with self=true + a title query, bearer auth', async () => { + const responder = () => + voicesBody([ + { + _id: 'v1', + title: '平淡', + languages: ['zh'], + samples: [{ audio: 'https://cdn.fish.audio/v1.mp3' }], + }, + ]); + await withFetch(responder, async (calls) => { + const voices = await listFishVoices({ creds: CREDS, query: '平淡' }); + const u = new URL(calls[0]!.url); + assert.equal(u.origin + u.pathname, 'https://api.fish.audio/model'); + assert.equal(u.searchParams.get('self'), 'true'); + assert.equal(u.searchParams.get('title'), '平淡'); + const h = calls[0]!.init.headers as Record; + assert.equal(h.authorization, 'Bearer k-secret'); + assert.deepEqual(voices, [ + { id: 'v1', title: '平淡', languages: ['zh'], sampleUrl: 'https://cdn.fish.audio/v1.mp3' }, + ]); + }); +}); + +test('omits the title param when no query is given', async () => { + await withFetch( + () => voicesBody([]), + async (calls) => { + await listFishVoices({ creds: CREDS }); + const u = new URL(calls[0]!.url); + assert.equal(u.searchParams.has('title'), false); + assert.equal(u.searchParams.get('self'), 'true'); + }, + ); +}); + +test('a voice with no samples yields an undefined sampleUrl', async () => { + const responder = () => voicesBody([{ _id: 'v2', title: 'bare', languages: [], samples: [] }]); + await withFetch(responder, async () => { + const voices = await listFishVoices({ creds: CREDS }); + assert.equal(voices[0]!.sampleUrl, undefined); + }); +}); diff --git a/packages/project-studio/public/app.js 
b/packages/project-studio/public/app.js index 13785b2..f46c0e6 100644 --- a/packages/project-studio/public/app.js +++ b/packages/project-studio/public/app.js @@ -835,6 +835,12 @@ function renderMain() { +
@@ -1103,8 +1109,13 @@ function wireSoundtrackPanel() { .filter((s) => s.length > 0).join('\n'); const nt = stitched || narrationText.value.trim(); if (!nt) { if (statusEl) statusEl.textContent = t('soundtrack.empty_narration'); return; } - const voiceSel = document.getElementById('st-narration-voice'); - payload.narration = { text: nt, volumeDb: Number(narrationVol.value), byFrame: state._narrationByFrame, ...(voiceSel?.value && { voiceId: voiceSel.value }) }; + // voiceId comes from whichever provider's control is active: the MiniMax + // + + +
+ + + +
+

${esc(t('settings.audio.fish_hint'))}

+ `; + const mmPane = panel.querySelector('#mm-pane'); + const faPane = panel.querySelector('#fa-pane'); + const toggle = panel.querySelector('#narration-provider-toggle'); + + // Reflect a provider in the toggle + which pane shows, and keep the + // soundtrack panel's voice control in sync if it's open. + const applyProvider = (provider) => { + toggle.querySelectorAll('.st-preset').forEach((b) => b.classList.toggle('active', b.dataset.provider === provider)); + mmPane.style.display = provider === 'fishaudio' ? 'none' : ''; + faPane.style.display = provider === 'fishaudio' ? '' : 'none'; + applyNarrationProviderToUI(provider); + }; + + // Load + persist the active narration provider. + let provider = 'minimax'; + try { + provider = (await fetch('/api/config/narration-provider').then((r) => r.json())).provider || 'minimax'; + } catch { /* default minimax */ } + applyProvider(provider); + toggle.querySelectorAll('.st-preset').forEach((btn) => { + btn.onclick = async () => { + const p = btn.dataset.provider; + applyProvider(p); + try { + await fetch('/api/config/narration-provider', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ provider: p }), + }); + } catch { /* non-fatal; UI already reflects the choice */ } + }; + }); + + // --- MiniMax wiring --- const statusEl = panel.querySelector('#audio-status'); const keyInput = panel.querySelector('#mm-api-key'); const baseInput = panel.querySelector('#mm-base-url'); @@ -3059,6 +3206,66 @@ async function renderSettingsAudio(panel) { saveState.textContent = ''; await refresh(); }; + + // --- FishAudio wiring (no region; key + base URL only) --- + const faStatusEl = panel.querySelector('#fa-status'); + const faKeyInput = panel.querySelector('#fa-api-key'); + const faBaseInput = panel.querySelector('#fa-base-url'); + const faSaveState = panel.querySelector('#fa-save-state'); + + const faRefresh = async () => { + try { + const s = await fetch('/api/config/fishaudio').then((r) => 
r.json()); + if (s.configured) { + const src = s.source === 'env' ? t('settings.audio.source_env') : t('settings.audio.source_config'); + faStatusEl.innerHTML = `${esc(t('settings.audio.configured', { key: s.maskedKey, source: src }))}`; + if (s.baseUrl) faBaseInput.value = s.baseUrl; + } else { + faStatusEl.innerHTML = `${esc(t('settings.audio.fish_not_configured'))}`; + } + } catch { + faStatusEl.textContent = t('settings.audio.fish_not_configured'); + } + }; + await faRefresh(); + + panel.querySelector('#fa-save').onclick = async () => { + const apiKey = faKeyInput.value.trim(); + if (!apiKey) { faSaveState.textContent = t('settings.audio.need_key'); return; } + faSaveState.textContent = t('settings.audio.saving'); + try { + const r = await fetch('/api/config/fishaudio', { + method: 'POST', + headers: { 'content-type': 'application/json' }, + body: JSON.stringify({ apiKey, baseUrl: faBaseInput.value.trim() }), + }); + if (!r.ok) throw new Error(`HTTP ${r.status}`); + faKeyInput.value = ''; + faSaveState.textContent = t('settings.audio.saved'); + await faRefresh(); + } catch (e) { + faSaveState.textContent = t('settings.audio.save_failed', { message: (e?.message ?? 
e) }); + } + }; + + panel.querySelector('#fa-clear').onclick = async () => { + await fetch('/api/config/fishaudio', { method: 'DELETE' }); + faKeyInput.value = ''; + faBaseInput.value = ''; + faSaveState.textContent = ''; + await faRefresh(); + }; +} + +/** Toggle the soundtrack narration voice control between the MiniMax built-in + * voice ) */ + .st-fish-voice { position: relative; display: flex; flex-direction: column; gap: 4px; min-width: 220px; } + .st-fish-search { font-size: 11px; padding: 3px 8px; border-radius: var(--radius-sm); + background: var(--bg); border: 1px solid var(--border); color: var(--text); } + .st-fish-search:focus { outline: none; border-color: var(--accent); } + .st-fish-picked { font-size: 10.5px; color: var(--text-muted); } + .st-fish-picked a { color: var(--accent); margin-left: 6px; } + .st-fish-results { position: absolute; top: 100%; left: 0; right: 0; z-index: 30; margin-top: 2px; + max-height: 200px; overflow-y: auto; background: var(--bg); border: 1px solid var(--border); + border-radius: var(--radius-sm); box-shadow: 0 6px 20px rgba(0,0,0,.18); } + .st-fish-results:empty { display: none; } + .st-fish-result { display: flex; align-items: center; gap: 8px; padding: 5px 9px; cursor: pointer; + font-size: 11px; border-bottom: 1px solid var(--border); } + .st-fish-result:last-child { border-bottom: none; } + .st-fish-result:hover { background: var(--accent); color: var(--accent-fg); } + .st-fish-result-title { flex: 1; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; } + .st-fish-result-lang { font-size: 10px; color: var(--text-muted); } + .st-fish-result:hover .st-fish-result-lang { color: var(--accent-fg); } + .st-fish-play { font-size: 10px; padding: 1px 6px; border-radius: 999px; cursor: pointer; + background: transparent; border: 1px solid currentColor; color: inherit; } + .st-fish-hint { padding: 6px 9px; font-size: 10.5px; color: var(--text-muted); } .st-fit { font-size: 11px; padding: 3px 10px; border-radius: 
var(--radius-sm); cursor: pointer; background: var(--bg); border: 1px solid var(--accent); color: var(--accent); margin-left: auto; transition: all .12s; } diff --git a/research/2026-06-15-spec-10-fishaudio-tts-provider.md b/research/2026-06-15-spec-10-fishaudio-tts-provider.md new file mode 100644 index 0000000..ebd8bd5 --- /dev/null +++ b/research/2026-06-15-spec-10-fishaudio-tts-provider.md @@ -0,0 +1,104 @@ +# RFC-10 · FishAudio TTS provider for narration + +- **Date**: 2026-06-15 +- **Status**: Draft (pending review) +- **Author**: fancy +- **Scope**: Add FishAudio as a second narration (text-to-speech) provider alongside the existing MiniMax integration, selectable per workspace. Music generation stays MiniMax-only (FishAudio has none). + +## 1. Context + +Narration today is hard-wired to MiniMax across ~6 files: + +| Layer | File | What it does | +|---|---|---| +| Provider | `packages/core/src/minimax.ts` | `resolveMinimaxCredentials` + `generateTts` (POST `/t2a_v2`) + `generateMusic` | +| Config | `packages/cli/src/media-config.ts` | `MediaConfigStore` persists key to `.html-video/media-config.json` | +| Server | `packages/cli/src/studio-server.ts` | `POST /api/projects/:id/generate-audio` (SSE) + `/api/config/minimax` | +| Mux | `packages/core/src/project.ts` | stores MP3 asset, muxes into export via ffmpeg | +| UI | `packages/project-studio/public/app.js` | Settings → Audio panel + Soundtrack panel (6 hard-coded voices) | +| i18n | `packages/project-studio/public/i18n.js` | `settings.audio.*` strings say "MiniMax" | + +There is **no provider abstraction** — the request shape (`voice_setting`/`audio_setting`), the JSON+hex+`base_resp` response envelope, the region-bound keys, and the fixed 6-voice catalog are all MiniMax-specific. + +We use FishAudio's TTS heavily and want it as a first-class narration backend. + +## 2. 
Verified FishAudio API behaviour + + Tested live against `api.fish.audio` with a real key (raw `curl` + `ffprobe`): + + - **TTS**: `POST https://api.fish.audio/v1/tts`, headers `Authorization: Bearer API_KEY`, `Content-Type: application/json`, `model: MODEL`. Body `{ text, format:"mp3", reference_id? }`. Response is **raw binary audio** (`audio/mpeg`), `Transfer-Encoding: chunked`. No JSON envelope, no hex. + - `s1` and `s2-pro` both verified. `reference_id` optional → default voice. `prosody.speed`/`format:wav` verified working. + - **`model` header is documented "required" but the live API accepts its absence** (server default). We send it explicitly anyway. + - **No duration is returned** by `/v1/tts`. (See §3 — this turns out not to matter.) + - Error codes are standard HTTP: 401 unauthorized, 402 no credit, 422 validation. +- **Voices**: `GET https://api.fish.audio/model?self=true&title=QUERY&page_size=N` → `{ total, items:[{ _id, title, languages, visibility, samples:[{ audio }] }] }`. `_id` is the `reference_id`; `samples[].audio` is a preview MP3 URL. The test account holds 6554 own models → a searchable picker is required, not a plain dropdown. +- **Single global host** — no international/China region split (unlike MiniMax). +- FishAudio also has ASR and voice-clone creation; **out of scope** here. + +## 3. Key finding that simplifies the design + +`generateTts` returns `durationSec` (from MiniMax `extra_info.audio_length`), but **nothing downstream consumes it** — it appears only in the cosmetic `providerNote` string. The "Fit timing to narration" feature (`studio-server.ts:1229`) re-paces frames by **narration text character count**, not audio duration. So FishAudio returning no duration is harmless; `durationSec` stays optional and cosmetic. + +## 4. 
Design (provider abstraction) + +### 4.1 Core — new `packages/core/src/fishaudio.ts` + +Mirrors `minimax.ts`'s narration surface (music intentionally absent): + +- `resolveFishAudioCredentials(env)` → `{ apiKey, baseUrl, model } | null` + - key: `FISH_AUDIO_API_KEY` → `FISHAUDIO_API_KEY` + - base: `FISH_AUDIO_BASE_URL` → default `https://api.fish.audio` (host only; we append `/v1/tts` and `/model`) + - model: `FISH_AUDIO_MODEL` → default `s1` +- `generateFishTts({ text, referenceId?, creds, signal? })` → POST `/v1/tts`, `model` header, `format:"mp3"`, read `arrayBuffer()` → `TtsAudioResult`. Maps 401/402/422 to friendly `HtmlVideoError('render-failed', …)`. +- `listFishVoices({ creds, query?, pageSize? })` → GET `/model?self=true&title=` → trimmed `[{ id, title, languages, sampleUrl }]`. + +Generalise `MinimaxAudioResult` → shared `TtsAudioResult { bytes; ext; providerNote; durationSec? }` (`ext` widened to `string`; **v1 FishAudio always emits `.mp3`** so the export mux assumptions are untouched). Re-export from `core/src/index.ts`. + +### 4.2 Config — `packages/cli/src/media-config.ts` + +`media-config.json` grows from `{ minimax }` to: + +```json +{ + "narrationProvider": "minimax" | "fishaudio", + "minimax": { "apiKey": "…", "baseUrl": "…" }, + "fishaudio": { "apiKey": "…", "baseUrl": "…" } +} +``` + +Add `getFishAudioStatus / setFishAudio / clearFishAudio / resolveFishAudio` (mirroring the MiniMax methods, minus region) and `getNarrationProvider / setNarrationProvider`. Existing MiniMax methods stay (music still uses them). `narrationProvider` defaults to `minimax` for backward compat. + +### 4.3 Server — `packages/cli/src/studio-server.ts` + +- `generate-audio` handler: **music** branch unchanged (always MiniMax). **narration** branch resolves the active `narrationProvider` and routes to `generateFishTts` or `generateTts`. If the chosen provider has no key → the existing friendly "configure your key" SSE failure, naming the right provider. 
+- New endpoints: + - `GET/POST/DELETE /api/config/fishaudio` — mirror `/api/config/minimax`. + - `GET/POST /api/config/narration-provider` — read/set the active provider. + - `GET /api/fishaudio/voices?q=` — proxy `listFishVoices` for the picker (server holds the key; the browser never sees it). + +### 4.4 UI — `packages/project-studio/public/app.js` + `i18n.js` + +- **Settings → Audio**: a provider toggle (MiniMax / FishAudio) driving `narrationProvider`. FishAudio pane = key input + a note that the model is controlled by `FISH_AUDIO_MODEL` (default `s1`); **region selector hidden** (MiniMax-only). Panel title becomes provider-aware (no longer a literal "MiniMax"). +- **Soundtrack → Narration**: when `narrationProvider==='fishaudio'`, the voice control becomes a **searchable picker** — a text box → debounced `GET /api/fishaudio/voices?q=` → result list (title · language · ▶ sample) → selecting stores `reference_id` (empty = default voice). When `minimax`, the existing 6-voice dropdown is shown. Volume slider (post-mix dB) is shared, unchanged. +- i18n: generalise `settings.audio.*`; add FishAudio strings (en + zh-CN). + +### 4.5 Non-goals (v1) + +ASR; voice-clone creation; per-request speed / temperature / format UI (matches the current MiniMax narration UI, which exposes only voice + volume); any music via FishAudio. + +## 5. Files touched + +`core/src/fishaudio.ts` (new), `core/src/minimax.ts` (result-type generalisation only), `core/src/index.ts`, `cli/src/media-config.ts`, `cli/src/studio-server.ts`, `project-studio/public/app.js`, `project-studio/public/i18n.js`. No change to the export/mux path or `project.ts`. + +## 6. Verification plan (must be real, not "tsc passes") + +1. `pnpm -r build` + `pnpm --filter @html-video/cli smoke` green. +2. Unit: `resolveFishAudioCredentials` env precedence; `generateFishTts` request shape + binary decode (mock fetch); error-code mapping. +3. 
**Real API** (with the provided key, never committed): `generateFishTts` → real MP3, `ffprobe` confirms valid audio; `listFishVoices` returns items. +4. **End-to-end in studio** (chrome-devtools): configure FishAudio key → switch provider → search + pick a voice → generate narration → asset appears + plays. Capture evidence (screenshot / ffprobe of the generated asset). +5. Regression: with `narrationProvider=minimax`, the existing flow is byte-for-byte unchanged. + +## 7. Open questions + +- Should the voice picker default `self=true` (own models) or also allow browsing the public marketplace? v1 = `self=true` only. +- Persist a small "recently used voices" shortlist later? Deferred.