diff --git a/.specs/video-transcript.md b/.specs/video-transcript.md
index 7db25f972..eeacff58d 100644
--- a/.specs/video-transcript.md
+++ b/.specs/video-transcript.md
@@ -81,7 +81,7 @@ const { text } = await generateText({
   messages: [{
     role: "user",
     content: [
-      { type: "file", data: readFileSync(videoPath), mimeType: "video/mp4" },
+      { type: "file", data: readFileSync(videoPath), mediaType: "video/mp4" },
       { type: "text", text: transcriptPrompt },
     ],
   }],
diff --git a/apps/cli/package.json b/apps/cli/package.json
index 164148c56..70559423c 100644
--- a/apps/cli/package.json
+++ b/apps/cli/package.json
@@ -52,9 +52,12 @@
     "typecheck": "tsgo --noEmit"
   },
   "dependencies": {
     "@agentclientprotocol/claude-agent-acp": "^0.24.2",
     "@effect/atom-react": "4.0.0-beta.35",
     "@effect/platform-node": "4.0.0-beta.35",
+    "@github/copilot": "^1.0.12",
+    "@google/gemini-cli": "^0.35.3",
     "@hono/node-server": "^1.19.11",
     "@tanstack/react-query": "^5.80.7",
     "@zed-industries/codex-acp": "^0.10.0",
+    "accessibility-checker-engine": "^4.0.16",
diff --git a/apps/cli/tests/runtime-deps.test.ts b/apps/cli/tests/runtime-deps.test.ts
new file mode 100644
index 000000000..e21c04965
--- /dev/null
+++ b/apps/cli/tests/runtime-deps.test.ts
@@ -0,0 +1,76 @@
+import { describe, it } from "vite-plus/test";
+import { readFileSync, readdirSync } from "node:fs";
+import { join, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+
+const cliRoot = join(dirname(fileURLToPath(import.meta.url)), "..");
+const distDir = join(cliRoot, "dist");
+const packageJson = JSON.parse(readFileSync(join(cliRoot, "package.json"), "utf-8"));
+
+const declaredDeps = new Set([
+  ...Object.keys(packageJson.dependencies ?? {}),
+  ...Object.keys(packageJson.peerDependencies ?? {}),
+  ...Object.keys(packageJson.optionalDependencies ?? {}),
+]);
+
+/**
+ * Extracts packages that are resolved at runtime from the bundled dist.
+ *
+ * The bundler (vp pack) inlines source but leaves dynamic require.resolve()
+ * and resolvePackageBin() calls intact. These need to be resolvable from
+ * the consumer's node_modules, so they must be declared in package.json.
+ *
+ * Patterns matched:
+ * - .resolve(`@scope/pkg/path`) — minified makeRequire().resolve()
+ * - .resolve("pkg/path") — unminified require.resolve()
+ * - resolvePackageBin(`@scope/pkg`) — minified as varName(`@scope/pkg`)
+ *   detected via try/catch context: try:()=>{let t=Fn(`pkg`)
+ */
+const extractRuntimeResolvedPackages = (): string[] => {
+  const distFiles = readdirSync(distDir).filter(
+    (file) => file.endsWith(".js") && !file.endsWith(".map"),
+  );
+  const patterns = [
+    /\.resolve\(["']([^"']+)["']\)/g,
+    /\.resolve\(`([^`$]+)`\)/g,
+    // resolvePackageBin gets minified to a short var called inside try blocks
+    /try:\(\)=>\{let \w+=\w+\(`([^`]+)`\)/g,
+  ];
+  const packages = new Set<string>();
+
+  for (const file of distFiles) {
+    const content = readFileSync(join(distDir, file), "utf-8");
+    for (const pattern of patterns) {
+      let match: RegExpExecArray | null;
+      while ((match = pattern.exec(content)) !== null) {
+        const specifier = match[1]!;
+        const parts = specifier.startsWith("@")
+          ? specifier.split("/").slice(0, 2)
+          : specifier.split("/").slice(0, 1);
+        const packageName = parts.join("/");
+
+        if (specifier.startsWith(`${packageName}/package.json`)) continue;
+
+        packages.add(packageName);
+      }
+    }
+  }
+
+  return [...packages];
+};
+
+describe("runtime dependency safety", () => {
+  it("all runtime-resolved packages in dist are declared in package.json dependencies", () => {
+    const runtimePackages = extractRuntimeResolvedPackages();
+    const missing = runtimePackages.filter((pkg) => !declaredDeps.has(pkg));
+
+    if (missing.length > 0) {
+      throw new Error(
+        `Found runtime-resolved packages in dist/ not in dependencies, peerDependencies, or optionalDependencies:\n\n` +
+          missing.map((pkg) => ` - ${pkg}`).join("\n") +
+          `\n\nAdd them to "dependencies" in apps/cli/package.json ` +
+          `so consumers with strict node_modules (pnpm) can resolve them.`,
+      );
+    }
+  });
+});
diff --git a/apps/video-transcript/package.json b/apps/video-transcript/package.json
new file mode 100644
index 000000000..e0744c051
--- /dev/null
+++ b/apps/video-transcript/package.json
@@ -0,0 +1,43 @@
+{
+  "name": "@expect/video-transcript",
+  "version": "0.0.1",
+  "private": true,
+  "description": "Extract structured interaction transcripts from screen recordings",
+  "type": "module",
+  "bin": {
+    "video-transcript": "./dist/index.js"
+  },
+  "main": "./dist/index.js",
+  "types": "./dist/index.d.ts",
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "scripts": {
+    "build": "vp pack",
+    "dev": "vp pack --watch",
+    "lint": "vp lint && tsc --noEmit",
+    "format": "vp fmt",
+    "format:check": "vp fmt --check",
+    "check": "vp check",
+    "test": "vp test",
+    "typecheck": "tsgo --noEmit"
+  },
+  "dependencies": {
+    "@ai-sdk/gateway": "^3.0.88",
+    "ai": "^6.0.146",
+    "commander": "^13.1.0",
+    "effect": "4.0.0-beta.35",
+    "picocolors": "^1.1.1"
+  },
+  "devDependencies": {
+    "@types/node": "^22.15.0",
+    "typescript": "^5.7.0",
+    "vitest": "*"
+  },
+ "engines": { + "node": ">=18" + } +} diff --git a/apps/video-transcript/src/activity-analyzer.test.ts b/apps/video-transcript/src/activity-analyzer.test.ts new file mode 100644 index 000000000..029601158 --- /dev/null +++ b/apps/video-transcript/src/activity-analyzer.test.ts @@ -0,0 +1,107 @@ +import { describe, it, expect } from "vitest"; +import { computeFrameDiff, classifySegments, formatTimeline } from "./activity-analyzer"; + +describe("computeFrameDiff", () => { + it("returns 0 for identical frames", () => { + const frame = Buffer.from([0, 128, 255, 64]); + expect(computeFrameDiff(frame, frame)).toBe(0); + }); + + it("returns 1 for maximally different frames", () => { + const black = Buffer.from([0, 0, 0, 0]); + const white = Buffer.from([255, 255, 255, 255]); + expect(computeFrameDiff(black, white)).toBeCloseTo(1, 5); + }); + + it("returns 0 for empty buffers", () => { + expect(computeFrameDiff(Buffer.alloc(0), Buffer.alloc(0))).toBe(0); + }); + + it("computes correct diff for known values", () => { + const frameA = Buffer.from([0, 0, 0, 0]); + const frameB = Buffer.from([255, 0, 0, 0]); + expect(computeFrameDiff(frameA, frameB)).toBeCloseTo(0.25, 5); + }); + + it("handles frames of different sizes using the shorter length", () => { + const short = Buffer.from([0, 0]); + const long = Buffer.from([255, 255, 255, 255]); + expect(computeFrameDiff(short, long)).toBeCloseTo(1, 5); + }); +}); + +describe("classifySegments", () => { + it("returns empty for no diffs", () => { + expect(classifySegments([])).toEqual([]); + }); + + it("classifies all-idle diffs as a single idle segment", () => { + const diffs = [0.001, 0.002, 0.001, 0.003, 0.002]; + const result = classifySegments(diffs); + expect(result.length).toBe(1); + expect(result[0]!.type).toBe("idle"); + expect(result[0]!.startSeconds).toBe(0); + expect(result[0]!.endSeconds).toBe(5); + }); + + it("classifies all-active diffs as a single active segment", () => { + const diffs = [0.05, 0.08, 0.06, 0.07]; + 
const result = classifySegments(diffs); + expect(result.length).toBe(1); + expect(result[0]!.type).toBe("active"); + }); + + it("detects scene changes from high diffs", () => { + const diffs = [0.01, 0.02, 0.5, 0.01, 0.02]; + const result = classifySegments(diffs); + const sceneChanges = result.filter((s) => s.type === "scene_change"); + expect(sceneChanges.length).toBeGreaterThanOrEqual(1); + }); + + it("merges short idle gaps between active segments", () => { + const diffs = [0.05, 0.06, 0.07, 0.001, 0.002, 0.05, 0.06, 0.07]; + const result = classifySegments(diffs); + const idleSegments = result.filter((s) => s.type === "idle"); + expect(idleSegments.length).toBe(0); + expect(result.every((s) => s.type === "active")).toBe(true); + }); + + it("preserves long idle gaps between active segments", () => { + const diffs = [0.05, 0.06, 0.001, 0.002, 0.001, 0.002, 0.05, 0.06]; + const result = classifySegments(diffs); + const idleSegments = result.filter((s) => s.type === "idle"); + expect(idleSegments.length).toBeGreaterThanOrEqual(1); + }); +}); + +describe("formatTimeline", () => { + it("formats a simple timeline", () => { + const timeline = [ + { type: "idle" as const, startSeconds: 0, endSeconds: 3 }, + { type: "active" as const, startSeconds: 3, endSeconds: 10 }, + { type: "idle" as const, startSeconds: 10, endSeconds: 12 }, + ]; + const result = formatTimeline(timeline); + expect(result).toContain("00:00"); + expect(result).toContain("00:03"); + expect(result).toContain("active"); + expect(result).toContain("idle"); + }); + + it("formats scene changes", () => { + const timeline = [{ type: "scene_change" as const, startSeconds: 5, endSeconds: 6 }]; + const result = formatTimeline(timeline); + expect(result).toContain("scene change (likely navigation)"); + }); + + it("formats times with minutes", () => { + const timeline = [{ type: "active" as const, startSeconds: 65, endSeconds: 130 }]; + const result = formatTimeline(timeline); + 
expect(result).toContain("01:05"); + expect(result).toContain("02:10"); + }); + + it("returns empty string for empty timeline", () => { + expect(formatTimeline([])).toBe(""); + }); +}); diff --git a/apps/video-transcript/src/activity-analyzer.ts b/apps/video-transcript/src/activity-analyzer.ts new file mode 100644 index 000000000..f27bfe70e --- /dev/null +++ b/apps/video-transcript/src/activity-analyzer.ts @@ -0,0 +1,209 @@ +import { execFile } from "node:child_process"; +import { readFileSync, mkdtempSync, rmSync, readdirSync } from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import { + FRAME_DIFF_IDLE_THRESHOLD, + FRAMES_PER_SECOND, + IDLE_CUT_THRESHOLD_SECONDS, + MIN_ACTIVE_SEGMENT_SECONDS, + SCENE_CHANGE_THRESHOLD, +} from "./constants"; +import type { ActivitySegment, ActivityTimeline } from "./types"; + +const execFileAsync = ( + command: string, + args: readonly string[], +): Promise<{ stdout: string; stderr: string }> => + new Promise((resolve, reject) => { + execFile(command, args, { maxBuffer: 10 * 1024 * 1024 }, (error, stdout, stderr) => { + if (error) reject(error); + else resolve({ stdout, stderr }); + }); + }); + +export const checkFfmpegAvailable = async (): Promise => { + try { + await execFileAsync("ffmpeg", ["-version"]); + return true; + } catch { + return false; + } +}; + +const extractFrames = async (videoPath: string, outputDir: string): Promise => { + await execFileAsync("ffmpeg", [ + "-i", + videoPath, + "-vf", + `fps=${FRAMES_PER_SECOND}`, + "-vsync", + "vfr", + "-f", + "rawvideo", + "-pix_fmt", + "gray", + "-s", + "320x180", + path.join(outputDir, "frame_%05d.raw"), + ]); + + const files = readdirSync(outputDir).filter((file) => file.startsWith("frame_")); + return files.length; +}; + +export const computeFrameDiff = (frameA: Buffer, frameB: Buffer): number => { + const length = Math.min(frameA.length, frameB.length); + if (length === 0) return 0; + + let totalDiff = 0; + for (let index = 0; index < 
length; index++) { + totalDiff += Math.abs(frameA[index]! - frameB[index]!) / 255; + } + + return totalDiff / length; +}; + +export const classifySegments = (diffs: readonly number[]): ActivityTimeline => { + const rawClassification: Array<"active" | "idle" | "scene_change"> = []; + + for (const diff of diffs) { + if (diff >= SCENE_CHANGE_THRESHOLD) { + rawClassification.push("scene_change"); + } else if (diff > FRAME_DIFF_IDLE_THRESHOLD) { + rawClassification.push("active"); + } else { + rawClassification.push("idle"); + } + } + + const segments: ActivitySegment[] = []; + let currentType = rawClassification[0]; + let segmentStart = 0; + + if (!currentType) return []; + + for (let index = 1; index <= rawClassification.length; index++) { + const nextType = rawClassification[index]; + if (nextType !== currentType || index === rawClassification.length) { + segments.push({ + type: currentType, + startSeconds: segmentStart, + endSeconds: index, + }); + if (nextType) { + currentType = nextType; + segmentStart = index; + } + } + } + + return mergeShortSegments(segments); +}; + +const mergeShortSegments = (segments: readonly ActivitySegment[]): ActivityTimeline => { + const merged: ActivitySegment[] = []; + + for (const segment of segments) { + const duration = segment.endSeconds - segment.startSeconds; + + if (segment.type === "active" && duration < MIN_ACTIVE_SEGMENT_SECONDS) { + const previous = merged[merged.length - 1]; + if (previous && previous.type === "idle") { + merged[merged.length - 1] = { ...previous, endSeconds: segment.endSeconds }; + } else { + merged.push(segment); + } + continue; + } + + if (segment.type === "idle" && duration <= IDLE_CUT_THRESHOLD_SECONDS) { + const previous = merged[merged.length - 1]; + if (previous && previous.type === "active") { + merged[merged.length - 1] = { ...previous, endSeconds: segment.endSeconds }; + continue; + } + } + + merged.push(segment); + } + + return merged; +}; + +export const analyzeActivity = async (videoPath: 
string): Promise => { + const framesDir = mkdtempSync(path.join(tmpdir(), "expect-frames-")); + + try { + const frameCount = await extractFrames(videoPath, framesDir); + if (frameCount < 2) return [{ type: "active", startSeconds: 0, endSeconds: frameCount }]; + + const frameSize = 320 * 180; + const diffs: number[] = []; + + for (let index = 1; index < frameCount; index++) { + const prevPath = path.join(framesDir, `frame_${String(index).padStart(5, "0")}.raw`); + const currPath = path.join(framesDir, `frame_${String(index + 1).padStart(5, "0")}.raw`); + + const prevFrame = readFileSync(prevPath); + const currFrame = readFileSync(currPath); + diffs.push(computeFrameDiff(prevFrame, currFrame)); + } + + return classifySegments(diffs); + } finally { + rmSync(framesDir, { recursive: true, force: true }); + } +}; + +export const buildTrimmedVideo = async ( + videoPath: string, + timeline: ActivityTimeline, +): Promise => { + const activeSegments = timeline.filter( + (segment) => segment.type === "active" || segment.type === "scene_change", + ); + + if (activeSegments.length === 0) return videoPath; + + const outputDir = mkdtempSync(path.join(tmpdir(), "expect-trimmed-")); + const outputPath = path.join(outputDir, "trimmed.mp4"); + + const filterParts = activeSegments.map( + (segment) => `between(t,${segment.startSeconds},${segment.endSeconds})`, + ); + const selectFilter = filterParts.join("+"); + + await execFileAsync("ffmpeg", [ + "-i", + videoPath, + "-vf", + `select='${selectFilter}',setpts=N/FRAME_RATE/TB`, + "-af", + `aselect='${selectFilter}',asetpts=N/SR/TB`, + "-y", + outputPath, + ]); + + return outputPath; +}; + +export const formatTimeline = (timeline: ActivityTimeline): string => { + const formatTime = (seconds: number): string => { + const minutes = Math.floor(seconds / 60); + const secs = seconds % 60; + return `${String(minutes).padStart(2, "0")}:${String(secs).padStart(2, "0")}`; + }; + + const lines = timeline.map((segment) => { + const label = + 
segment.type === "scene_change" + ? "scene change (likely navigation)" + : segment.type === "idle" + ? "idle" + : "active"; + return `- [${formatTime(segment.startSeconds)}–${formatTime(segment.endSeconds)}] ${label}`; + }); + + return lines.join("\n"); +}; diff --git a/apps/video-transcript/src/constants.ts b/apps/video-transcript/src/constants.ts new file mode 100644 index 000000000..816487dd2 --- /dev/null +++ b/apps/video-transcript/src/constants.ts @@ -0,0 +1,14 @@ +export const FRAME_DIFF_IDLE_THRESHOLD = 0.005; +export const IDLE_CUT_THRESHOLD_SECONDS = 3; +export const SCENE_CHANGE_THRESHOLD = 0.15; +export const MIN_ACTIVE_SEGMENT_SECONDS = 1; +export const FRAMES_PER_SECOND = 1; + +export const SUPPORTED_VIDEO_EXTENSIONS = [".mp4", ".webm", ".mov", ".avi", ".mkv"] as const; +export const SUPPORTED_MIME_TYPES: Record = { + ".mp4": "video/mp4", + ".webm": "video/webm", + ".mov": "video/quicktime", + ".avi": "video/x-msvideo", + ".mkv": "video/x-matroska", +}; diff --git a/apps/video-transcript/src/extract-transcript.test.ts b/apps/video-transcript/src/extract-transcript.test.ts new file mode 100644 index 000000000..c289b70a8 --- /dev/null +++ b/apps/video-transcript/src/extract-transcript.test.ts @@ -0,0 +1,76 @@ +import { describe, it, expect, vi, beforeAll, beforeEach, afterAll } from "vitest"; + +vi.mock("@ai-sdk/gateway", () => ({ + gateway: (model: string) => ({ modelId: model, provider: "gateway" }), +})); + +vi.mock("ai", () => ({ + generateText: vi.fn().mockResolvedValue({ + text: "## Login\n\n[00:03] ACTION: User clicks Sign In\n TARGET: Sign In button\n RESULT: Login form appears", + }), +})); + +import { extractTranscript } from "./extract-transcript"; +import { generateText } from "ai"; +import path from "node:path"; +import { writeFileSync, mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; + +describe("extractTranscript", () => { + let tempDir: string; + let videoPath: string; + + beforeAll(() => { + tempDir = 
mkdtempSync(path.join(tmpdir(), "extract-test-")); + videoPath = path.join(tempDir, "test.mp4"); + writeFileSync(videoPath, Buffer.from("fake video content")); + }); + + beforeEach(() => { + vi.mocked(generateText).mockClear(); + }); + + afterAll(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + it("calls generateText with gateway model", async () => { + const result = await extractTranscript(videoPath, undefined); + + expect(generateText).toHaveBeenCalledOnce(); + const call = vi.mocked(generateText).mock.calls[0]![0]; + expect((call.model as { modelId: string }).modelId).toBe("google/gemini-2.5-flash"); + expect(result).toContain("Login"); + }); + + it("passes video data as file part with correct media type", async () => { + await extractTranscript(videoPath, undefined); + + const call = vi.mocked(generateText).mock.calls[0]![0]; + const messages = call.messages as Array<{ + content: Array<{ type: string; mediaType?: string }>; + }>; + const filePart = messages[0]!.content.find((part) => part.type === "file"); + expect(filePart).toBeDefined(); + expect(filePart!.mediaType).toBe("video/mp4"); + }); + + it("includes timeline in prompt when provided", async () => { + const timeline = [ + { type: "active" as const, startSeconds: 0, endSeconds: 10 }, + { type: "idle" as const, startSeconds: 10, endSeconds: 15 }, + ]; + + await extractTranscript(videoPath, timeline); + + const call = vi.mocked(generateText).mock.calls[0]![0]; + const messages = call.messages as Array<{ content: Array<{ type: string; text?: string }> }>; + const textPart = messages[0]!.content.find((part) => part.type === "text"); + expect(textPart!.text).toContain("Activity timeline"); + }); + + it("returns transcript text from generateText response", async () => { + const result = await extractTranscript(videoPath, undefined); + expect(result).toContain("ACTION: User clicks Sign In"); + }); +}); diff --git a/apps/video-transcript/src/extract-transcript.ts 
b/apps/video-transcript/src/extract-transcript.ts new file mode 100644 index 000000000..0fc39dd07 --- /dev/null +++ b/apps/video-transcript/src/extract-transcript.ts @@ -0,0 +1,38 @@ +import { readFileSync } from "node:fs"; +import path from "node:path"; +import { gateway } from "@ai-sdk/gateway"; +import { generateText } from "ai"; +import { SUPPORTED_MIME_TYPES } from "./constants"; +import { buildTranscriptPrompt } from "./transcript-prompt"; +import type { ActivityTimeline } from "./types"; + +const MODEL = "google/gemini-2.5-flash"; + +const getMimeType = (videoPath: string): string => { + const extension = path.extname(videoPath).toLowerCase(); + return SUPPORTED_MIME_TYPES[extension] ?? "video/mp4"; +}; + +export const extractTranscript = async ( + videoPath: string, + timeline: ActivityTimeline | undefined, +): Promise => { + const mimeType = getMimeType(videoPath); + const videoData = readFileSync(videoPath); + const prompt = buildTranscriptPrompt(timeline); + + const { text } = await generateText({ + model: gateway(MODEL), + messages: [ + { + role: "user", + content: [ + { type: "file", data: videoData, mediaType: mimeType }, + { type: "text", text: prompt }, + ], + }, + ], + }); + + return text; +}; diff --git a/apps/video-transcript/src/index.ts b/apps/video-transcript/src/index.ts new file mode 100644 index 000000000..d11da961f --- /dev/null +++ b/apps/video-transcript/src/index.ts @@ -0,0 +1,128 @@ +#!/usr/bin/env node +import { existsSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import { Command } from "commander"; +import pc from "picocolors"; +import { SUPPORTED_VIDEO_EXTENSIONS } from "./constants"; +import { + analyzeActivity, + buildTrimmedVideo, + checkFfmpegAvailable, + formatTimeline, +} from "./activity-analyzer"; +import { extractTranscript } from "./extract-transcript"; + +const program = new Command() + .name("video-transcript") + .description("Extract structured interaction transcripts from screen recordings") + 
.version("0.0.1") + .argument("