Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions packages/engine/src/services/audioMixer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,121 @@ describe("processCompositionAudio", () => {
expect(filter).toContain("adelay=2000|2000");
});

it("bounds expression nesting for dense keyframe automation without dropping the envelope", async () => {
  // Scratch directories are registered in `tempDirs` alongside the other cases' directories.
  const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
  const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));
  tempDirs.push(baseDir, workDir);
  writeFileSync(join(baseDir, "bgm.wav"), "stub");

  // Reproduce what the 60 Hz timeline probe emits for a 10s eased fade:
  // hundreds of keyframes. If the nested-if volume expression gained one level
  // per keyframe, FFmpeg would fail filter-graph init past ~95 levels and the
  // whole audio track would be dropped (GH #1066 follow-up).
  const KEYFRAME_COUNT = 300;
  // Quadratic ease-in over 0-3s, hold at 0.8 until 7s, quadratic ease-out to 10s.
  const easedFadeVolume = (time: number): number => {
    if (time < 3) return 0.8 * (time / 3) ** 2;
    if (time < 7) return 0.8;
    return 0.8 * (1 - (time - 7) / 3) ** 2;
  };
  const keyframes: { time: number; volume: number }[] = [];
  for (let i = 0; i < KEYFRAME_COUNT; i += 1) {
    const time = (i / (KEYFRAME_COUNT - 1)) * 10;
    keyframes.push({ time, volume: easedFadeVolume(time) });
  }

  const result = await processCompositionAudio(
    [
      {
        id: "bgm",
        src: "bgm.wav",
        start: 0,
        end: 10,
        mediaStart: 0,
        layer: 0,
        volume: 0,
        volumeKeyframes: keyframes,
        type: "audio",
      },
    ],
    baseDir,
    workDir,
    join(baseDir, "out.m4a"),
    10,
  );

  expect(result.success).toBe(true);

  // The second ffmpeg invocation is the mix; pull out its filter graph.
  const mixArgs = runFfmpegMock.mock.calls[1]?.[0];
  const filterPosition = mixArgs.indexOf("-filter_complex");
  const filter = mixArgs[filterPosition + 1];

  // Each envelope segment contributes one nested `if(lt(...))`, so counting
  // them gives the nesting depth. It must stay well below the FFmpeg
  // evaluator's limit (MAX_VOLUME_SEGMENTS = 32) while still being a real envelope.
  const nestedConditionals = filter.match(/if\(lt\(t/g) ?? [];
  const nestingDepth = nestedConditionals.length;
  expect(nestingDepth).toBeGreaterThan(1);
  expect(nestingDepth).toBeLessThan(32);

  // The reduced envelope is still per-frame automation that starts from silence.
  expect(filter).toContain(":eval=frame");
  expect(filter).toMatch(/volume=if\(lt\(t\\,[0-9.]+\)\\,0\+/);
});

it("falls back to a static-volume mix instead of dropping audio when the automated mix fails", async () => {
  const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
  const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));
  tempDirs.push(baseDir, workDir);
  writeFileSync(join(baseDir, "bgm.wav"), "stub");

  // Model an ffmpeg build that rejects the automation expression. The queued
  // one-shot implementations are consumed in call order:
  //   call 0 (prepare)       -> succeeds
  //   call 1 (automated mix) -> fails at filter init
  //   call 2 (fallback mix)  -> no one-shot queued, uses the mock's default
  const prepareSucceeds = async () => ({
    success: true,
    durationMs: 1,
    stderr: "",
    exitCode: 0,
  });
  const automatedMixFails = async () => ({
    success: false,
    durationMs: 1,
    stderr: "Error initializing filters",
    exitCode: 234,
  });
  runFfmpegMock.mockImplementationOnce(prepareSucceeds).mockImplementationOnce(automatedMixFails);

  const result = await processCompositionAudio(
    [
      {
        id: "bgm",
        src: "bgm.wav",
        start: 0,
        end: 5,
        mediaStart: 0,
        layer: 0,
        volume: 0.8,
        volumeKeyframes: [
          { time: 0, volume: 0.8 },
          { time: 5, volume: 0 },
        ],
        type: "audio",
      },
    ],
    baseDir,
    workDir,
    join(baseDir, "out.m4a"),
    5,
  );

  expect(result.success).toBe(true);
  expect(result.tracksProcessed).toBe(1);
  expect(runFfmpegMock).toHaveBeenCalledTimes(3);
  // The downgrade must be reported to the caller rather than hidden.
  expect(result.error).toMatch(/base volume/i);

  // The retry's filter graph carries only the base volume, no automation expression.
  const fallbackArgs = runFfmpegMock.mock.calls[2]?.[0];
  const fallbackFilterPosition = fallbackArgs.indexOf("-filter_complex") + 1;
  const fallbackFilter = fallbackArgs[fallbackFilterPosition];
  expect(fallbackFilter).not.toContain(":eval=frame");
  expect(fallbackFilter).toContain("volume=0.8");
});

it("prepares percent-encoded non-Latin audio srcs from decoded filesystem paths", async () => {
const baseDir = mkdtempSync(join(tmpdir(), "hf-audio-base-"));
const workDir = mkdtempSync(join(tmpdir(), "hf-audio-work-"));
Expand Down
206 changes: 162 additions & 44 deletions packages/engine/src/services/audioMixer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { runFfmpeg } from "../utils/runFfmpeg.js";
import { unwrapTemplate } from "../utils/htmlTemplate.js";
import { resolveProjectRelativeSrc } from "./videoFrameExtractor.js";
import type { AudioElement, AudioTrack, MixResult } from "./audioMixer.types.js";
import { applyVolumeEnvelopeToWav } from "./audioVolumeEnvelope.js";

export type { AudioElement, MixResult } from "./audioMixer.types.js";

Expand All @@ -30,10 +31,89 @@ function escapeExpressionCommas(expression: string): string {
return expression.replace(/\\/g, "\\\\").replace(/,/g, "\\,");
}

function buildVolumeExpression(track: AudioTrack): string {
/**
 * Cap on the number of volume-automation keyframes that survive into the
 * FFmpeg `volume` expression.
 *
 * That expression is a chain of nested `if(lt(...))` terms that grows with the
 * keyframe count, and FFmpeg's expression evaluator only tolerates a finite
 * nesting depth. Past ~95 levels (build-dependent; lower on some Linux ffmpeg
 * builds) `volume=...:eval=frame` fails filter-graph init, which fails the
 * whole mix and drops the audio track. The 60 Hz timeline probe routinely
 * produces 100–300 keyframes for a multi-second fade (GH #1066 follow-up: a
 * 171-keyframe GSAP fade rendered with no audio). 32 leaves a wide safety
 * margin and is far more resolution than a piecewise-linear envelope needs.
 */
const MAX_VOLUME_SEGMENTS = 32;

/**
 * Largest volume deviation (linear gain) a keyframe may have from the straight
 * line between its retained neighbours and still be discarded.
 *
 * Deliberately tight (0.5% linear) so the rendered piecewise-linear envelope
 * follows the GSAP curve heard in the browser preview to within ~0.2 dB across
 * the audible range, well under the ~1 dB loudness JND, keeping render WYSIWYG
 * with preview. A full ease-in/ease-out fade still comes out at ~25 segments,
 * which fits inside MAX_VOLUME_SEGMENTS.
 */
const VOLUME_SIMPLIFY_EPSILON = 0.005;

/**
 * Shrinks a time-sorted keyframe list to a perceptually equivalent
 * piecewise-linear envelope with a bounded number of points.
 *
 * Two passes are applied:
 *  1. Ramer–Douglas–Peucker, measuring each keyframe's volume deviation from
 *     the straight line between the current range's endpoints. Anything within
 *     `VOLUME_SIMPLIFY_EPSILON` is dropped, so a linear fade collapses to its
 *     two endpoints and an eased fade to a handful of points.
 *  2. If more than `MAX_VOLUME_SEGMENTS` points remain (e.g. audio-rate volume
 *     oscillation), an evenly spaced pick of the survivors bounds the output.
 *
 * The first and last keyframes are always retained by pass 1, so the envelope
 * keeps covering the whole clip.
 *
 * @param keyframes Keyframes ordered by ascending `time`.
 * @returns The input array itself when it holds fewer than three keyframes,
 *   otherwise a new array referencing a subset of the input keyframes.
 */
function simplifyVolumeKeyframes(
  keyframes: { time: number; volume: number }[],
): { time: number; volume: number }[] {
  const lastIndex = keyframes.length - 1;
  // With fewer than three keyframes there is no interior point to discard.
  if (lastIndex < 2) return keyframes;

  // Indices that survive pass 1; both endpoints are kept unconditionally.
  const retained = new Set<number>([0, lastIndex]);
  // Explicit worklist instead of recursion so dense inputs cannot exhaust the call stack.
  const pending: Array<{ from: number; to: number }> = [{ from: 0, to: lastIndex }];

  for (let range = pending.pop(); range !== undefined; range = pending.pop()) {
    const { from, to } = range;
    const head = keyframes[from]!;
    const tail = keyframes[to]!;
    const duration = tail.time - head.time;

    // Find the interior keyframe that strays furthest from the head→tail line.
    // Starting the threshold at epsilon means in-tolerance points never win,
    // and the strict comparison keeps the earliest point on ties.
    let worstIndex = -1;
    let worstError = VOLUME_SIMPLIFY_EPSILON;
    for (let index = from + 1; index < to; index += 1) {
      const candidate = keyframes[index]!;
      let expected = head.volume;
      if (duration !== 0) {
        expected =
          head.volume + ((tail.volume - head.volume) * (candidate.time - head.time)) / duration;
      }
      const error = Math.abs(candidate.volume - expected);
      if (error > worstError) {
        worstError = error;
        worstIndex = index;
      }
    }

    // Every interior point is within tolerance: this range is a single segment.
    if (worstIndex === -1) continue;

    retained.add(worstIndex);
    pending.push({ from, to: worstIndex }, { from: worstIndex, to });
  }

  const reduced = keyframes.filter((_, index) => retained.has(index));
  if (reduced.length <= MAX_VOLUME_SEGMENTS) return reduced;

  // Backstop: take MAX_VOLUME_SEGMENTS evenly spaced survivors. Slot 0 maps to
  // the first point and the final slot to the last, so both ends are sampled.
  const stride = (reduced.length - 1) / (MAX_VOLUME_SEGMENTS - 1);
  const picked: typeof keyframes = [];
  for (let slot = 0; slot < MAX_VOLUME_SEGMENTS; slot += 1) {
    const candidate = reduced[Math.round(slot * stride)]!;
    const previous = picked[picked.length - 1];
    // Only accept points that move strictly forward in time.
    if (previous === undefined || candidate.time > previous.time) picked.push(candidate);
  }
  return picked;
}

function buildVolumeExpression(track: AudioTrack, ignoreKeyframes = false): string {
const trimDuration = track.end - track.start;
const staticVolume = clampVolume(track.volume);
const keyframes = (track.volumeKeyframes ?? [])
const keyframes = (ignoreKeyframes ? [] : (track.volumeKeyframes ?? []))
.filter((keyframe) => Number.isFinite(keyframe.time) && Number.isFinite(keyframe.volume))
.map((keyframe) => ({
time: Math.max(0, Math.min(trimDuration, keyframe.time - track.start)),
Expand All @@ -57,14 +137,19 @@ function buildVolumeExpression(track: AudioTrack): string {
}
}

if (deduped.length === 1) {
return `volume=${formatFilterNumber(deduped[0]!.volume)}`;
// Collapse the densely-sampled probe output to a bounded piecewise-linear
// envelope. Without this, the nested-if expression below grows one level per
// keyframe and overflows FFmpeg's expression evaluator (see MAX_VOLUME_SEGMENTS).
const simplified = simplifyVolumeKeyframes(deduped);

if (simplified.length === 1) {
return `volume=${formatFilterNumber(simplified[0]!.volume)}`;
}

let expression = formatFilterNumber(deduped.at(-1)!.volume);
for (let i = deduped.length - 2; i >= 0; i -= 1) {
const current = deduped[i]!;
const next = deduped[i + 1]!;
let expression = formatFilterNumber(simplified.at(-1)!.volume);
for (let i = simplified.length - 2; i >= 0; i -= 1) {
const current = simplified[i]!;
const next = simplified[i + 1]!;
const currentTime = formatFilterNumber(current.time);
const nextTime = formatFilterNumber(next.time);
const currentVolume = formatFilterNumber(current.volume);
Expand Down Expand Up @@ -299,42 +384,58 @@ async function mixAudioTracks(
const outputDir = dirname(outputPath);
if (!existsSync(outputDir)) mkdirSync(outputDir, { recursive: true });

const inputs: string[] = [];
const filterParts: string[] = [];

tracks.forEach((track, i) => {
inputs.push("-i", track.srcPath);
const delayMs = Math.round(track.start * 1000);
const trimDuration = track.end - track.start;
const volumeFilter = buildVolumeExpression(track);
filterParts.push(
`[${i}:a]atrim=0:${trimDuration},${volumeFilter},adelay=${delayMs}|${delayMs},apad=whole_dur=${totalDuration}[a${i}]`,
);
});

const mixInputs = tracks.map((_, i) => `[a${i}]`).join("");
const weights = tracks.map(() => "1").join(" ");
const mixFilter = `${mixInputs}amix=inputs=${tracks.length}:duration=longest:dropout_transition=0:normalize=0:weights='${weights}'[mixed]`;
const postMixGainFilter = `[mixed]volume=${masterOutputGain}[out]`;
const fullFilter = [...filterParts, mixFilter, postMixGainFilter].join(";");
const buildArgs = (ignoreAutomation: boolean): string[] => {
const inputs: string[] = [];
const filterParts: string[] = [];
tracks.forEach((track, i) => {
inputs.push("-i", track.srcPath);
const delayMs = Math.round(track.start * 1000);
const trimDuration = track.end - track.start;
const volumeFilter = buildVolumeExpression(track, ignoreAutomation);
filterParts.push(
`[${i}:a]atrim=0:${trimDuration},${volumeFilter},adelay=${delayMs}|${delayMs},apad=whole_dur=${totalDuration}[a${i}]`,
);
});

const args = [
...inputs,
"-filter_complex",
fullFilter,
"-map",
"[out]",
"-acodec",
"aac",
"-b:a",
"192k",
"-t",
String(totalDuration),
"-y",
outputPath,
];
const mixInputs = tracks.map((_, i) => `[a${i}]`).join("");
const weights = tracks.map(() => "1").join(" ");
const mixFilter = `${mixInputs}amix=inputs=${tracks.length}:duration=longest:dropout_transition=0:normalize=0:weights='${weights}'[mixed]`;
const postMixGainFilter = `[mixed]volume=${masterOutputGain}[out]`;
const fullFilter = [...filterParts, mixFilter, postMixGainFilter].join(";");

return [
...inputs,
"-filter_complex",
fullFilter,
"-map",
"[out]",
"-acodec",
"aac",
"-b:a",
"192k",
"-t",
String(totalDuration),
"-y",
outputPath,
];
};

const result = await runFfmpeg(args, { signal, timeout: ffmpegProcessTimeout });
let result = await runFfmpeg(buildArgs(false), { signal, timeout: ffmpegProcessTimeout });

// Defense in depth: volume automation is folded into an FFmpeg `volume`
// expression whose evaluator limits are build-dependent (see
// MAX_VOLUME_SEGMENTS). If that ever fails the mix, retry once without the
// automation so the track renders at its base volume rather than being
// dropped from the output entirely — a missing fade beats missing audio.
let degradedAutomation = false;
const hasAutomation = tracks.some((track) => (track.volumeKeyframes?.length ?? 0) > 0);
if (!result.success && !signal?.aborted && hasAutomation) {
const retry = await runFfmpeg(buildArgs(true), { signal, timeout: ffmpegProcessTimeout });
if (retry.success) {
result = retry;
degradedAutomation = true;
}
}

if (signal?.aborted) {
return {
Expand All @@ -360,6 +461,9 @@ async function mixAudioTracks(
outputPath,
durationMs: result.durationMs,
tracksProcessed: tracks.length,
error: degradedAutomation
? "Volume automation exceeded this ffmpeg build's expression limits; rendered at base volume"
: undefined,
};
}

Expand Down Expand Up @@ -452,15 +556,29 @@ export async function processCompositionAudio(
audioSrcPath = trimmedPath;
}

// Primary volume-automation path: bake the envelope into the PCM samples
// (sample-accurate, no keyframe ceiling). If the WAV isn't the expected
// 16-bit PCM, fall back to the ffmpeg expression path by leaving the
// keyframes on the track for buildVolumeExpression to handle.
let bakedEnvelope = false;
if (element.volumeKeyframes && element.volumeKeyframes.length > 0) {
bakedEnvelope = applyVolumeEnvelopeToWav(
audioSrcPath,
element.volumeKeyframes,
element.start,
element.volume ?? 1.0,
);
}
tracks.push({
id: element.id,
srcPath: audioSrcPath,
start: element.start,
end: element.end,
mediaStart: element.mediaStart,
duration: element.end - element.start,
volume: element.volume ?? 1.0,
volumeKeyframes: element.volumeKeyframes,
// Gain is already in the samples when baked, so mix at unity.
volume: bakedEnvelope ? 1.0 : (element.volume ?? 1.0),
volumeKeyframes: bakedEnvelope ? undefined : element.volumeKeyframes,
});
} catch (err: unknown) {
errors.push(`Error: ${element.id} — ${err instanceof Error ? err.message : String(err)}`);
Expand Down
Loading
Loading