diff --git a/packages/core/src/elevenlabs/index.ts b/packages/core/src/elevenlabs/index.ts index 25fc83dc2..977740358 100644 --- a/packages/core/src/elevenlabs/index.ts +++ b/packages/core/src/elevenlabs/index.ts @@ -15,3 +15,5 @@ export { ElevenLabsError, } from "./client.js"; export type { ElevenLabsVoice, SynthesizeOptions } from "./client.js"; +export { generateSoundEffect, clampSfxDuration, SFX_BOUNDS } from "./sfx.js"; +export type { GenerateSfxOptions, GenerateSfxResult } from "./sfx.js"; diff --git a/packages/core/src/elevenlabs/sfx.ts b/packages/core/src/elevenlabs/sfx.ts new file mode 100644 index 000000000..6b40469c6 --- /dev/null +++ b/packages/core/src/elevenlabs/sfx.ts @@ -0,0 +1,104 @@ +/** + * ElevenLabs Sound Generation client. + * + * Mirrors the existing `synthesize()` shape for voice — single function call + * that returns audio bytes + chosen format. Errors surface via the same + * `ElevenLabsError` so callers get a uniform try/catch shape across + * voice / sfx / music. + */ + +import { ElevenLabsError } from "./client.js"; + +const API_BASE = "https://api.elevenlabs.io/v1"; + +export interface GenerateSfxOptions { + /** 0.5..22 seconds. ElevenLabs treats this as a target — actual ±20%. */ + durationSeconds?: number; + /** + * 0..1. Higher = stick closer to the prompt; lower = more creative + * variation. Default 0.3 matches ElevenLabs's own default for the SFX + * playground. + */ + promptInfluence?: number; + /** Output format. mp3_44100_128 is the cheapest acceptable quality for SFX. */ + outputFormat?: "mp3_44100_128" | "mp3_44100_192"; +} + +export interface GenerateSfxResult { + bytes: Uint8Array; + format: NonNullable; +} + +/** + * Generate one sound effect from a text prompt. Returns mp3 bytes — caller + * writes them to disk. 
/**
 * Generate one sound effect from a text prompt via the ElevenLabs
 * `sound-generation` endpoint. Returns the audio bytes — the caller writes
 * them to disk.
 *
 *   const { bytes } = await generateSoundEffect(apiKey, "snap zoom whoosh", { durationSeconds: 1.5 });
 *   fs.writeFileSync("sfx.mp3", bytes);
 *
 * Cost is billed per generation, not per duration — short SFX cost the same
 * as long ones up to the 22-second cap. Surface the count in the cost log.
 *
 * @param apiKey ElevenLabs API key, sent as the `xi-api-key` header.
 * @param prompt Text description of the sound. It is trimmed; the trimmed
 *   value must be non-empty and at most 1000 characters.
 * @param opts Optional target duration, prompt influence and output format.
 *   Numeric options are clamped to the accepted range; a `NaN` value is
 *   treated as "not supplied" and left out of the request.
 * @returns The response body as bytes plus the output format that was requested.
 * @throws ElevenLabsError when the prompt is empty or too long (no status),
 *   or when the API answers with a non-2xx status (status attached).
 *   Transport-level failures raised by `fetch` itself propagate unchanged.
 */
export async function generateSoundEffect(
  apiKey: string,
  prompt: string,
  opts: GenerateSfxOptions = {},
): Promise<GenerateSfxResult> {
  // Trim once and validate the trimmed value; a non-string prompt counts as empty.
  const text = typeof prompt === "string" ? prompt.trim() : "";
  if (!text) {
    throw new ElevenLabsError("generateSoundEffect: prompt is required");
  }
  if (text.length > 1000) {
    throw new ElevenLabsError("generateSoundEffect: prompt too long (max 1000 chars)");
  }

  const format = opts.outputFormat ?? "mp3_44100_128";
  const body: Record<string, unknown> = { text, output_format: format };

  // ElevenLabs caps duration at 0.5..22 — clamp here so a misconfigured caller
  // doesn't get a 422 round-trip.
  const duration = clampUnlessNaN(opts.durationSeconds, 0.5, 22);
  if (duration !== undefined) {
    body.duration_seconds = duration;
  }
  const influence = clampUnlessNaN(opts.promptInfluence, 0, 1);
  if (influence !== undefined) {
    body.prompt_influence = influence;
  }

  // NOTE(review): the public API reference lists `output_format` as a query
  // parameter. It is sent there as well as in the body (kept for backward
  // compatibility) so the `format` reported to the caller matches what was
  // requested — confirm against the current ElevenLabs reference.
  const url = `${API_BASE}/sound-generation?output_format=${encodeURIComponent(format)}`;
  const res = await fetch(url, {
    method: "POST",
    headers: {
      "xi-api-key": apiKey,
      "Content-Type": "application/json",
      Accept: "audio/mpeg",
    },
    body: JSON.stringify(body),
  });

  if (!res.ok) {
    let detail = "";
    try {
      const raw = await res.text();
      detail = raw.length > 500 ? raw.slice(0, 500) + "…" : raw;
    } catch {
      // Best effort only: the status line alone is still a usable error.
    }
    throw new ElevenLabsError(
      `generateSoundEffect: ${res.status} ${res.statusText}${detail ? ` — ${detail}` : ""}`,
      res.status,
    );
  }

  return { bytes: new Uint8Array(await res.arrayBuffer()), format };
}

/**
 * Clamp `value` into [min, max]. Returns `undefined` when the value is absent
 * or `NaN`: `Math.max`/`Math.min` would pass `NaN` through and
 * `JSON.stringify` would then emit `null` on the wire. Infinite values still
 * clamp to the nearest bound.
 */
function clampUnlessNaN(value: number | undefined, min: number, max: number): number | undefined {
  if (typeof value !== "number" || Number.isNaN(value)) return undefined;
  return Math.max(min, Math.min(max, value));
}

/*
 * Helpers for clamping values before they hit the wire — exported so the
 * studio API route can reuse the same bounds when validating user input.
 */
+ */ +export const SFX_BOUNDS = { + durationMin: 0.5, + durationMax: 22, + promptMaxChars: 1000, +} as const; + +export function clampSfxDuration(durationSeconds: number): number { + if (!Number.isFinite(durationSeconds)) return 2; + return Math.max(SFX_BOUNDS.durationMin, Math.min(SFX_BOUNDS.durationMax, durationSeconds)); +} diff --git a/packages/core/src/script/assemble.ts b/packages/core/src/script/assemble.ts index 169847e19..8412b9731 100644 --- a/packages/core/src/script/assemble.ts +++ b/packages/core/src/script/assemble.ts @@ -8,6 +8,7 @@ import { getLoadedThemeByName } from "./themes/index.js"; import type { PlannedScene, PlannedScript, SceneTransition } from "./types.js"; import type { ImageEntry, ImageManifest } from "../images/index.js"; import type { VisualDirectionPlan } from "./visualDirector.js"; +import { readSfxManifest, resolveSfxStartForScene, type SfxEntry } from "./sfx/manifest.js"; export interface AssembleOptions { projectDir: string; @@ -75,6 +76,17 @@ export function assembleMaster(planned: PlannedScript, opts: AssembleOptions): A let cursor = 0; const sceneFragments: string[] = []; const audioTags: string[] = []; + + // SFX manifest: read once, group entries by sceneId so each scene's loop + // iteration can emit them at the right cursor position. The manifest is + // optional — projects without it just skip the SFX lane. + const sfxManifest = readSfxManifest(opts.projectDir); + const sfxBySceneId = new Map(); + for (const entry of sfxManifest.entries) { + const list = sfxBySceneId.get(entry.sceneId) ?? []; + list.push(entry); + sfxBySceneId.set(entry.sceneId, list); + } const sceneVisibility: Array<{ id: string; start: number; @@ -182,6 +194,22 @@ export function assembleMaster(planned: PlannedScript, opts: AssembleOptions): A ` `, ); } + + // SFX entries land on track 3 with the same audio timing rules as the + // voiceover. 
Each entry's start time is computed from its anchor (scene- + // start / accent-word / scene-end) — see resolveSfxStartForScene for + // the math. Volume scales the runtime mixer when supplied; the producer + // package consumes data-volume-db at render time. + const sceneSfx = sfxBySceneId.get(scene.id) ?? []; + for (const entry of sceneSfx) { + const start = resolveSfxStartForScene(entry, scene, cursor, sceneTotal); + const volumeAttr = + typeof entry.volumeDb === "number" ? ` data-volume-db="${entry.volumeDb.toFixed(1)}"` : ""; + const labelAttr = entry.label ? ` data-timeline-label="${escapeAttr(entry.label)}"` : ""; + audioTags.push( + ` `, + ); + } const transitionIn: SceneTransition = scene.transition ?? defaultTransitionForTemplate(scene.template); const transitionInMs = (TRANSITION_DURATIONS[transitionIn] ?? 0) * 1000; diff --git a/packages/core/src/script/sfx/manifest.test.ts b/packages/core/src/script/sfx/manifest.test.ts new file mode 100644 index 000000000..9d6ded79c --- /dev/null +++ b/packages/core/src/script/sfx/manifest.test.ts @@ -0,0 +1,145 @@ +import { describe, it, expect } from "vitest"; +import { resolveSfxStart, type SfxEntry } from "./manifest"; + +const baseEntry: Pick = { + anchor: "scene-start", + durationSeconds: 1.5, +}; + +describe("resolveSfxStart", () => { + it("scene-start anchor returns sceneStart + audioStartOffset", () => { + expect( + resolveSfxStart({ + sceneStart: 10, + sceneDuration: 6, + audioStartOffset: 0.15, + voiceDurationSeconds: 5, + voiceWordCount: 12, + entry: { ...baseEntry, anchor: "scene-start" }, + }), + ).toBeCloseTo(10.15, 3); + }); + + it("scene-start with zero lead-in returns sceneStart exactly", () => { + expect( + resolveSfxStart({ + sceneStart: 30, + sceneDuration: 4, + audioStartOffset: 0, + voiceDurationSeconds: 3, + voiceWordCount: 8, + entry: { ...baseEntry, anchor: "scene-start" }, + }), + ).toBe(30); + }); + + it("scene-end anchor places SFX so it finishes at scene end", () => { + // sceneEnd = 16; 
SFX is 1.5s long → start at 14.5 + expect( + resolveSfxStart({ + sceneStart: 10, + sceneDuration: 6, + audioStartOffset: 0.15, + voiceDurationSeconds: 5, + voiceWordCount: 12, + entry: { ...baseEntry, anchor: "scene-end", durationSeconds: 1.5 }, + }), + ).toBe(14.5); + }); + + it("scene-end with SFX longer than scene clamps to sceneStart (won't precede the scene)", () => { + expect( + resolveSfxStart({ + sceneStart: 50, + sceneDuration: 2, + audioStartOffset: 0, + voiceDurationSeconds: 1.5, + voiceWordCount: 4, + entry: { ...baseEntry, anchor: "scene-end", durationSeconds: 5 }, + }), + ).toBe(50); + }); + + it("accent-word anchor interpolates by word index", () => { + // 4 words over 5s narration → each word gets ~1.25s; word index 2 = 2 * 1.25 = 2.5s + // sceneStart 10 + audioStartOffset 0.15 + 2.5 = 12.65 + expect( + resolveSfxStart({ + sceneStart: 10, + sceneDuration: 6, + audioStartOffset: 0.15, + voiceDurationSeconds: 5, + voiceWordCount: 4, + entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 2 }, + }), + ).toBeCloseTo(12.65, 3); + }); + + it("accent-word with index 0 = scene-start + offset (the first word fires immediately)", () => { + expect( + resolveSfxStart({ + sceneStart: 10, + sceneDuration: 6, + audioStartOffset: 0.15, + voiceDurationSeconds: 5, + voiceWordCount: 5, + entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 0 }, + }), + ).toBeCloseTo(10.15, 3); + }); + + it("accent-word with index past the word count clamps to last word", () => { + // wordCount 5, requested index 99 → clamps to 4 (last word). + // Each word = 5 / 5 = 1s. Index 4 = 4s offset. 
10 + 0.15 + 4 = 14.15 + expect( + resolveSfxStart({ + sceneStart: 10, + sceneDuration: 6, + audioStartOffset: 0.15, + voiceDurationSeconds: 5, + voiceWordCount: 5, + entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 99 }, + }), + ).toBeCloseTo(14.15, 3); + }); + + it("accent-word falls back to scene-start when narration is empty", () => { + expect( + resolveSfxStart({ + sceneStart: 8, + sceneDuration: 4, + audioStartOffset: 0.2, + voiceDurationSeconds: 0, + voiceWordCount: 0, + entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 3 }, + }), + ).toBeCloseTo(8.2, 3); + }); + + it("accent-word with negative index clamps to 0", () => { + expect( + resolveSfxStart({ + sceneStart: 10, + sceneDuration: 6, + audioStartOffset: 0, + voiceDurationSeconds: 4, + voiceWordCount: 4, + entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: -5 }, + }), + ).toBe(10); + }); + + it("never returns a value before sceneStart", () => { + // Defensive: scene-end with bizarre inputs. + expect( + resolveSfxStart({ + sceneStart: 100, + sceneDuration: 0.1, + audioStartOffset: 0, + voiceDurationSeconds: 0.05, + voiceWordCount: 1, + entry: { ...baseEntry, anchor: "scene-end", durationSeconds: 99 }, + }), + ).toBe(100); + }); +}); diff --git a/packages/core/src/script/sfx/manifest.ts b/packages/core/src/script/sfx/manifest.ts new file mode 100644 index 000000000..f8d266d3f --- /dev/null +++ b/packages/core/src/script/sfx/manifest.ts @@ -0,0 +1,204 @@ +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { dirname, join } from "node:path"; +import type { PlannedScene } from "../types.js"; + +/** + * SFX manifest — the contract between the storyline routes (which generate + * sound-effects via ElevenLabs) and the assembler (which emits + * `