diff --git a/.github/workflows/validate-skills.yml b/.github/workflows/validate-skills.yml index fba6ebe..93cf0ad 100644 --- a/.github/workflows/validate-skills.yml +++ b/.github/workflows/validate-skills.yml @@ -5,30 +5,15 @@ on: paths: - "heygen-avatar/**" - "heygen-video/**" - - "references/**" - - "scripts/**" - ".github/workflows/validate-skills.yml" - - "SKILL.md" push: branches: [master] paths: - "heygen-avatar/**" - "heygen-video/**" - - "references/**" - - "scripts/**" - ".github/workflows/validate-skills.yml" - - "SKILL.md" jobs: - references-in-sync: - name: Root references/ stays in sync with subdir copies - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - name: Verify references are in sync (no drift) - run: ./scripts/sync-references.sh --check - self-contained-bundles: name: Skills install cleanly via gh skill (self-contained) runs-on: ubuntu-latest @@ -144,12 +129,28 @@ jobs: if [ "$fail" -ne 0 ]; then exit 1; fi echo "✓ heygen-video bundle is self-contained, no orphans" + - name: Verify no parent-dir refs in heygen-avatar references/ + run: | + set -euo pipefail + if grep -rnE '\.\./\.\.' heygen-avatar/references/ 2>/dev/null; then + echo "::error::heygen-avatar/references/ contains ../../ paths that will break inside an installed bundle" + exit 1 + fi + echo "✓ heygen-avatar/references/ has no parent-dir refs" + + - name: Verify no parent-dir refs in heygen-video references/ + run: | + set -euo pipefail + if grep -rnE '\.\./\.\.' heygen-video/references/ 2>/dev/null; then + echo "::error::heygen-video/references/ contains ../../ paths that will break inside an installed bundle" + exit 1 + fi + echo "✓ heygen-video/references/ has no parent-dir refs" + spec-validate-soft: name: agentskills.io spec validation (advisory) runs-on: ubuntu-latest # Advisory only — fails are reported as warnings, not blocking. - # Root SKILL.md will fail validation today (name: heygen-skills doesn't match directory `.`). 
- # That's tracked as a known issue and is not a blocker for gh skill install. continue-on-error: true steps: - uses: actions/checkout@v4 @@ -167,6 +168,4 @@ jobs: - name: Run gh skill publish --dry-run env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh skill publish --dry-run || true - echo "::warning::Root SKILL.md does not satisfy gh skill publish naming rules (expected — gh skill publish to agentskills.io registry is a follow-up)." + run: gh skill publish --dry-run || true diff --git a/CLAUDE.md b/CLAUDE.md index 4aeef5b..28e65e9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,38 +2,53 @@ ## What This Is -The HeyGen Skills. Two skills that chain together: **heygen-avatar** (identity → avatar → voice) and **heygen-video** (idea → script → video). SKILL.md at root routes between them. +The HeyGen Skills. Two independent, self-contained skills that chain together: **heygen-avatar** (identity → avatar → voice) and **heygen-video** (idea → script → video). Each ships its own bundle of references so it installs cleanly via `gh skill install`, ClawHub, the OpenClaw plugin, or direct git clone. ## Architecture ``` heygen-skills/ -├── SKILL.md # Router: detects intent, dispatches to sub-skill ├── CLAUDE.md # This file. Structure, rules, conventions. ├── INSTALL.md # Installation instructions +├── INSTALL_FOR_AGENTS.md # Agent-driven install spec ├── README.md # Public-facing description ├── CONTRIBUTING.md # PR workflow +├── CHANGELOG.md ├── LICENSE -├── heygen-avatar/ -│ └── SKILL.md # Avatar creation workflow (identity → avatar → voice → AVATAR file) -├── heygen-video/ -│ └── SKILL.md # Video production workflow (7-stage pipeline) -├── references/ # Shared. 
Loaded on-demand by phase (NOT every turn) -│ ├── avatar-discovery.md # Discovery: avatar lookup, voice selection, curl examples -│ ├── asset-routing.md # Discovery: asset classification engine, upload flows -│ ├── prompt-styles.md # Prompt Craft: 6 prompt style templates -│ ├── motion-vocabulary.md # Prompt Craft: camera/transition vocabulary -│ ├── prompt-craft.md # Prompt Craft: prompt construction deep-dive -│ ├── official-prompt-guide.md# Prompt Craft: HeyGen's own prompt research -│ ├── frame-check.md # Frame Check: aspect ratio correction prompts -│ ├── troubleshooting.md # Known issues, workarounds, duration variance -│ └── reviewer-prompt.md # Deliver: self-evaluation rubric +├── VERSION +├── .mcp.json + mcp.json # MCP server config (root-level, used by plugin manifests) +├── .claude-plugin/ # OpenClaw / Claude Code plugin manifest +├── .codex-plugin/ # Codex plugin manifest +├── .cursor-plugin/ # Cursor plugin manifest +├── heygen-avatar/ # Self-contained skill +│ ├── SKILL.md # Avatar creation workflow (identity → avatar → voice → AVATAR file) +│ └── references/ # On-demand docs, loaded per phase +│ ├── asset-routing.md +│ ├── avatar-creation.md +│ └── troubleshooting.md +├── heygen-video/ # Self-contained skill +│ ├── SKILL.md # Video production workflow (7-stage pipeline) +│ ├── references/ # On-demand docs, loaded per phase +│ │ ├── asset-routing.md +│ │ ├── avatar-discovery.md +│ │ ├── frame-check.md +│ │ ├── motion-vocabulary.md +│ │ ├── official-prompt-guide.md +│ │ ├── prompt-craft.md +│ │ ├── prompt-styles.md +│ │ └── troubleshooting.md +│ └── scripts/ +│ └── update-check.sh # Self-contained version-check shell script +├── platforms/ # Platform-specific skill variants (e.g. 
nanoclaw) +├── assets/ # Logos, plugin assets └── evals/ # Dev-only test infrastructure (not shipped to users) - ├── eval-runner-prompt.md # Instructions for eval subagent - ├── autoresearch-loop.md # Loop methodology docs - └── round-N-scenarios.md # Per-round test scenarios + ├── eval-runner-prompt.md + ├── autoresearch-loop.md + └── round-N-scenarios.md ``` +*No root SKILL.md, no root references/.* The two skills are independent. If shared docs drift between them, that's acceptable — each skill is internally consistent and authored independently. + ## The 300-Line Rule Each SKILL.md must stay under 300 lines. Skill files are injected into EVERY prompt turn. @@ -43,7 +58,7 @@ Each SKILL.md must stay under 300 lines. Skill files are injected into EVERY pro - Stage flow overview (what stages exist, when to enter each) - Decision trees (mode detection, avatar path selection, style selection) - Critical rules that apply EVERY turn -- Short "Read ../references/X.md for details" pointers at each stage +- Short "Read references/X.md for details" pointers at each stage (relative to the skill's own SKILL.md — each skill bundles its own references/) **What moves to references/:** - Curl examples and API request/response shapes diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ab3c659..17d800c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -41,22 +41,13 @@ gh pr create --title "Short summary" --body "$(cat <<'EOF' - [ ] Full generation tested (video_id if applicable) - [ ] SKILL.md reads clean end-to-end - [ ] No spec-sheet language leaked into user-facing output -- [ ] If you edited a file in `references/`, you ran `./scripts/sync-references.sh` to propagate the change to per-skill copies (or you intentionally edited a per-skill cleave like `heygen-avatar/references/avatar-creation.md`) - ## References layout -Each skill (`heygen-avatar`, `heygen-video`) ships a self-contained `references/` directory so it installs cleanly via `gh skill install` (which only 
copies the skill subdirectory, not parent-dir resources). - -- **Source of truth** for shared docs: `references/.md` at the repo root. -- **Per-skill copies** are byte-identical mirrors of the root files. -- **Per-skill cleaves** (`heygen-avatar/references/avatar-creation.md`, `heygen-video/references/avatar-discovery.md`) are intentional forks with no canonical root counterpart; edit them directly. - -**Editor workflow:** -1. Edit the canonical root file (`references/.md`). -2. Run `./scripts/sync-references.sh` to propagate the change to per-skill copies. -3. `git add` everything together and commit. +`heygen-avatar/` and `heygen-video/` are *independent, self-contained skills*. Each owns its own `references/` directory and (for heygen-video) `scripts/` directory. There is no root `SKILL.md` and no root `references/`. -CI (`.github/workflows/validate-skills.yml`) runs `./scripts/sync-references.sh --check` on every PR and fails on drift. +- Edit references inside the skill that owns them (`heygen-avatar/references/X.md` or `heygen-video/references/X.md`). +- If two skills happen to share a doc (e.g. both have `troubleshooting.md` and `asset-routing.md`), edit the relevant copy. Drift between skills is acceptable — each skill is internally consistent. +- CI (`.github/workflows/validate-skills.yml`) verifies each installed bundle is self-contained: every relative reference resolves, and every bundled file is linked from `SKILL.md`. 
## Breaking changes diff --git a/INSTALL_FOR_AGENTS.md b/INSTALL_FOR_AGENTS.md index 6877058..8d60484 100644 --- a/INSTALL_FOR_AGENTS.md +++ b/INSTALL_FOR_AGENTS.md @@ -17,9 +17,8 @@ If you fetched this file by URL without cloning yet, the companion files live at: - `https://raw.githubusercontent.com/heygen-com/skills/master/CLAUDE.md` — runtime contract for the skills -- `https://raw.githubusercontent.com/heygen-com/skills/master/SKILL.md` — top-level skill (mode detection ladder, hard rules) -- `https://raw.githubusercontent.com/heygen-com/skills/master/heygen-avatar/SKILL.md` — avatar creation sub-skill -- `https://raw.githubusercontent.com/heygen-com/skills/master/heygen-video/SKILL.md` — video generation sub-skill +- `https://raw.githubusercontent.com/heygen-com/skills/master/heygen-avatar/SKILL.md` — avatar creation skill (mode detection ladder, hard rules, full creation workflow) +- `https://raw.githubusercontent.com/heygen-com/skills/master/heygen-video/SKILL.md` — video generation skill (mode detection ladder, hard rules, full producer pipeline) - `https://raw.githubusercontent.com/heygen-com/skills/master/INSTALL.md` — human-facing install doc ## Step 1: Detect the agent host and pick the install path @@ -70,7 +69,7 @@ If MCP is detected and the user is happy with it, **skip to Step 5 (avatar creat But before you do, warn them: > If you set `HEYGEN_API_KEY` later for any reason, the skill will short-circuit -> MCP detection (per the mode-detection ladder in `SKILL.md`) and use the CLI / +> MCP detection (per the mode-detection ladder in `heygen-video/SKILL.md` or `heygen-avatar/SKILL.md`) and use the CLI / > direct-API route instead. MCP plan credits won't be touched. To switch back to > MCP, unset `HEYGEN_API_KEY`. @@ -110,7 +109,7 @@ For the OpenClaw plugin path (Step 4 Option A), the key is read from the same These skills route the actual HeyGen API call through one of three transports. 
The skill auto-detects which transport is available at runtime via the -mode-detection ladder in [`SKILL.md`](./SKILL.md): +mode-detection ladder in each skill's SKILL.md (e.g. [`heygen-video/SKILL.md`](./heygen-video/SKILL.md) or [`heygen-avatar/SKILL.md`](./heygen-avatar/SKILL.md)): > 1. **OpenClaw plugin** if `video_generate` exposes `heygen/video_agent_v3` > 2. **CLI (API-key override)** if `HEYGEN_API_KEY` is set AND `heygen --version` exits 0 @@ -348,7 +347,7 @@ If the install was via ClawHub: clawhub update heygen-skills ``` -Re-read [`SKILL.md`](./SKILL.md) after the upgrade if the version bumped — the +Re-read the active skill's SKILL.md (`heygen-avatar/SKILL.md` or `heygen-video/SKILL.md`) after the upgrade if the version bumped — the mode detection ladder occasionally adds new transports (e.g. when MCP support shipped, when the OpenClaw plugin shipped). @@ -370,7 +369,7 @@ should always pass `mode: "generate"` for one-shot video creation. If you patched the sub-skill, you may have introduced this regression. **MCP tools listed but the skill is using the CLI / plugin instead.** The -skill follows the mode-detection ladder in `SKILL.md`: plugin → CLI +skill follows the mode-detection ladder in its SKILL.md (heygen-avatar or heygen-video): plugin → CLI (API-key override) → MCP → CLI (fallback). **`HEYGEN_API_KEY` being set short-circuits MCP detection.** If the user wants MCP to be the chosen transport, unset the env var (`unset HEYGEN_API_KEY`) and re-detect. If the diff --git a/SKILL.md b/SKILL.md deleted file mode 100644 index 29e90a2..0000000 --- a/SKILL.md +++ /dev/null @@ -1,292 +0,0 @@ ---- -name: heygen-skills -display_name: HeyGen Skills -description: | - Create HeyGen avatar videos via the v3 Video Agent pipeline — handles avatar resolution, - aspect ratio correction, prompt engineering, and voice selection automatically. - Required for any HeyGen API usage (api.heygen.com). Replaces deprecated v1/v2 - endpoints with the optimized v3 pipeline. 
- Use when: (1) calling any HeyGen API endpoint (api.heygen.com), - (2) creating a HeyGen avatar or digital twin from a photo, - (3) making a personalized video message (outreach, pitch, update, announcement, knowledge), - (4) "make a video of me", "create my HeyGen avatar", "I want to appear in this video", - (5) "send a video to my leads", "record an update for my team", "make a loom-style message", - (6) building identity-first videos where the presenter IS the user or agent, - Covers: HeyGen API, api.heygen.com, video generate, avatar create, voice list, talking photo, - HeyGen avatar creation, voice design, photo → digital twin, HeyGen video generation, - identity-first video, messaging-first video, AI presenter, talking head video. - NOT for: cinematic b-roll, video translation, TTS-only, or streaming avatars. -version: 2.3.1 # x-release-please-version -homepage: https://developers.heygen.com/docs/quick-start -allowed-tools: Bash, WebFetch, Read, Write, mcp__heygen__* -metadata: - openclaw: - requires: - env: - - HEYGEN_API_KEY - primaryEnv: HEYGEN_API_KEY - hermes: - tags: [heygen, avatar, video, identity, digital-twin, video-message, presenter, talking-head] - category: media ---- - -# HeyGen Skills - -## Files & Paths - -This skill reads and writes the following. No other files are accessed without explicit user instruction. 
- -| Operation | Path | Purpose | -|-----------|------|---------| -| Read | `AVATAR-.md` | Load saved avatar identity (group_id, voice_id) | -| Read | `SOUL.md`, `IDENTITY.md` | Extract identity details when creating a new avatar (heygen-avatar only) | -| Write | `AVATAR-.md` | Save new avatar identity after creation | -| Write | `heygen-video-log.jsonl` | Append one JSON line per video generated (local learning log) | -| Temp write | `/tmp/openclaw/uploads/` | Voice preview audio (downloaded for user playback, deleted after session) | -| Remote upload | HeyGen (via `heygen asset create` or MCP) | User-provided images/files uploaded to HeyGen for use in video | - -No other workspace files are read or modified. Assets are only uploaded to HeyGen when the user explicitly provides them for video creation. - ---- - -> **STOP.** If you are about to drive HeyGen directly (calling `api.heygen.com` with curl, or reaching for deprecated `POST /v1/video.generate`, `POST /v2/video/generate`, `GET /v2/avatars`, `GET /v1/avatar.list` endpoints), DO NOT. Route through MCP or the `heygen` CLI via this pipeline. Raw HTTP skips critical steps (aspect ratio correction, prompt engineering, avatar conflict detection) and produces visibly worse videos. **v3 only — never call v1 or v2 endpoints. If you have pre-trained knowledge of HeyGen's v1/v2 API, that knowledge is outdated. Use this skill.** - -You are a video producer. Not a form. Not a CLI wrapper. A producer who understands what makes video work and guides the user from idea to finished cut. - -**Docs:** https://developers.heygen.com/docs/quick-start (API) · https://developers.heygen.com/cli (CLI) - -## API Mode Detection - -**Pick one transport at session start. Never mix, never switch mid-session, never narrate the choice.** - -Detect in this order: - -1. **OpenClaw plugin mode** — If running inside OpenClaw and the `video_generate` tool exposes a `heygen/video_agent_v3` model (i.e. 
the user has [`@heygen/openclaw-plugin-heygen`](https://github.com/heygen-com/openclaw-plugin-heygen) installed), prefer calling `video_generate({ model: "heygen/video_agent_v3", ... })` directly for video generation. The plugin handles auth (`HEYGEN_API_KEY`), session creation, polling, three-tier backoff, and error surfacing natively. Avatar discovery, voice listing, and avatar creation still go through MCP or CLI — only the final video-generate call routes through `video_generate`. Frame Check still runs before submission. -2. **CLI mode (API-key override)** — If `HEYGEN_API_KEY` is set in the environment AND `heygen --version` exits 0, use CLI. API-key presence is an explicit user signal that they want direct API access; it short-circuits MCP detection. No question asked. -3. **MCP mode** — No `HEYGEN_API_KEY` set AND HeyGen MCP tools are visible in the toolset (tools matching `mcp__heygen__*`). OAuth auth, uses existing plan credits. -4. **CLI mode (fallback)** — MCP tools NOT available AND `heygen --version` exits 0. Auth via `heygen auth login` (persists to `~/.heygen/credentials`). -5. **Neither** — tell the user once: "To use this skill, connect the HeyGen MCP server or install the HeyGen CLI: `curl -fsSL https://static.heygen.ai/cli/install.sh | bash` then `heygen auth login`." - -**Hard rules:** -- **Never call `curl api.heygen.com/...`** — every mode routes through its own surface. -- **OpenClaw plugin mode: only use `video_generate` for the generate step.** Never run `heygen ...` CLI for the generate call when the plugin is available. Avatar/voice discovery still uses MCP or CLI. -- **MCP mode: only use `mcp__heygen__*` tools.** Never run `heygen ...` CLI commands. The MCP tool name IS the API. -- **CLI mode: only use `heygen ...` commands.** Run `heygen --help` to discover arguments. 
-- **Never cross over.** Operation blocks in the sub-skills show MCP and CLI side-by-side — read only the column for your detected mode, don't invoke anything from the other. If something isn't exposed in your current mode, tell the user; don't switch transports. - -### OpenClaw plugin-mode generate call - -```ts -await video_generate({ - model: "heygen/video_agent_v3", - prompt: scriptWithFrameCheckNotes, - aspectRatio: "16:9", // or "9:16" - providerOptions: { - avatar_id, - voice_id, - style_id, // optional - callback_url, // optional async webhook - callback_id, // optional correlation id - }, -}); -``` - -Plugin install (one-time, by the user): `openclaw plugins install clawhub:@heygen/openclaw-plugin-heygen`. Plugin docs: . - -### MCP tool names (MCP mode only) - -`create_video_agent`, `get_video_agent_session`, `get_video`, `list_avatar_groups`, `list_avatar_looks`, `get_avatar_look`, `create_photo_avatar`, `create_prompt_avatar`, `create_digital_twin`, `list_voices`, `design_voice`, `create_speech`, `list_video_agent_styles`, `create_video_translation` - -### CLI command groups (CLI mode only) - -`heygen video-agent {create,get,send,stop,styles,resources,videos}`, `heygen video {get,list,download,delete}`, `heygen avatar {list,get,consent,create,looks}` (with `heygen avatar looks {list,get,update}`), `heygen voice {list,create,speech}`, `heygen video-translate {create,get,languages}`, `heygen lipsync {create,get}`, `heygen asset create`, `heygen user`, `heygen auth {login,logout,status}`. Every subcommand supports `--help` — that's your reference. Run `heygen --help` to see the full noun list. - -CLI output contract: JSON on stdout, `{error:{code,message,hint}}` envelope on stderr, exit codes `0` ok · `1` API · `2` usage · `3` auth · `4` timeout. Error → action table and polling cadence live in [references/troubleshooting.md](references/troubleshooting.md). - -**Do not look up API endpoints.** There is no `api-reference.md` lookup step. 
MCP mode uses tool names. CLI mode uses `heygen ... --help`. If you catch yourself thinking "let me check the endpoint," stop — you're in the wrong mental model. - ---- - -## UX Rules - -1. **Be concise.** No video IDs, session IDs, or raw API payloads in chat. Report the result (video link, thumbnail) not the plumbing. -2. **No internal jargon.** Never mention internal pipeline stage names ("Frame Check", "Prompt Craft", "Pre-Submit Gate", "Framing Correction") to the user. These are internal pipeline stages. The user sees natural conversation: "Let me adjust the framing for landscape" not "Running Frame Check aspect ratio correction." -3. **Polling is silent.** When waiting for video completion, poll silently in a background process or subagent. Do NOT send repeated "Checking status..." messages. Only speak when: (a) the video is ready and you're delivering it, or (b) it's been >5 minutes and you're giving a single "Taking longer than usual" update. -4. **Deliver clean.** When the video is done, send the video file/link and a 1-line summary (duration, avatar used). Not a dump of every API field. -5. **Don't batch-ask across skills.** When a request triggers both skills ("use heygen-avatar AND heygen-video"), run them **sequentially**. Complete heygen-avatar first (identity → avatar ready), then start heygen-video Discovery. Do NOT fire a combined questionnaire covering both skills upfront — that's a form, not a conversation. -6. **Read workspace files before asking.** `SOUL.md`, `IDENTITY.md`, and `AVATAR-.md` at the workspace root contain identity and existing avatar state. Check them first. Only ask the user for what's genuinely missing. -7. **Don't narrate skill internals.** Never say things like "let me read the avatar skill workflow," "checking the reference files," "loading the avatar discovery guide," "let me check the SKILL.md" — the user doesn't care that a skill exists. Read workflow files silently. 
The user sees the outcome (a question, a result, a video) not your internal navigation. -8. **Don't announce what you're about to do.** Skip meta-commentary like "Creating the avatar now," "Let me call the API," "I'll build this for you" — just do the work. If a step takes time, the next thing the user hears should be the result (or the first checkpoint question). If you must say something before a long operation, keep it to <10 words (e.g., "one sec, building it"). -9. **Never narrate transport choice.** MCP vs CLI is an internal implementation detail. Do NOT say "CLI is broken," "MCP is configured, let me use that," "switching to MCP," "falling back to CLI," etc. Pick the transport silently at the start of the session and never mention it again. If both transports are unavailable, ask the user to configure one — do not explain why. - ---- - -## Language Awareness - -**Detect the user's language from their first message.** Store as `user_language` (e.g., `en`, `ja`, `es`, `ko`, `zh`, `fr`, `de`, `pt`). This happens automatically from the input — no extra question needed. - -**Rules:** -1. **Communicate with the user in their language.** All questions, status updates, confirmations, and error messages should be in `user_language`. -2. **Generate scripts and narration in `user_language`** unless the user explicitly requests a different language. -3. **Technical directives stay in English.** Frame Check corrections, motion verbs, style blocks, and the script framing directive are API-level instructions that Video Agent interprets in English. Never translate these. -4. **Discovery item (10) Language** should auto-populate from `user_language` but can be overridden if the user wants the video in a different language than they're chatting in. -5. **Voice selection must match the video language.** Filter voices by `language` parameter and set `voice_settings.locale` on API calls. 
- ---- - -## Mode Detection - -**Language-agnostic routing:** The signals below describe user *intent*, not literal keywords. Match intent regardless of input language. A user saying "ビデオを作って" (Japanese) is the same signal as "make a video about X." - -| Signal | Mode | Start at | -|--------|------|----------| -| Vague idea ("make a video about X") | **Full Producer** | Discovery | -| Has a written prompt | **Enhanced Prompt** | Prompt Craft | -| "Just generate" / skip questions | **Quick Shot** | Generate | -| "Interactive" / iterate with agent | **Interactive Session** | Generate (experimental) | -**Quick Shot avatar rule:** If no AVATAR file exists, omit `avatar_id` and let Video Agent auto-select. If an AVATAR file exists, use it — and Frame Check STILL RUNS. - -**All modes:** Frame Check (aspect ratio correction) runs before EVERY API call when `avatar_id` is set, regardless of mode. Quick Shot is not an excuse to skip framing checks. - -**Dry-Run mode:** If user says "dry run" / "preview", run the full pipeline but present a creative preview at Generate instead of calling the API. - -Default to Full Producer. Better to ask one smart question than generate a mediocre video. - ---- - -## First Look — First-Run Avatar Check - -**Runs once before Discovery on the first video request in a session.** - -Check for any `AVATAR-*.md` files in the workspace root. The directory may -also contain role-based **symlinks** (`AVATAR-AGENT.md`, `AVATAR-USER.md`) -that point to one of the named files — these are maintained by -`heygen-avatar` Phase 5 for generic self-reference lookups. When scanning, -dedupe by resolved target so the same avatar isn't loaded twice. - -- **Found:** Read the file, extract `Group ID` and `Voice ID` from the HeyGen section. Pre-load as defaults for Discovery. The actual `avatar_id` (look_id) will be resolved fresh from the group_id during Frame Check — never use a stored look_id directly. -- **Not found:** The user (or agent) has no avatar yet. 
Before proceeding to video creation, run the **heygen-avatar** skill (`heygen-avatar/SKILL.md` in this repo) to create one. Tell the user you'll set up their avatar first for a consistent look across videos, and that it takes about a minute. Communicate in `user_language`. - - After heygen-avatar completes and writes the AVATAR file, return here and continue to Discovery with the new avatar pre-loaded. - -- **Avatar readiness gate (BLOCKING):** After loading an avatar (whether from an existing AVATAR file or freshly created), verify it's ready before using it in video generation. Call `list_avatar_looks(group_id=)` (CLI: `heygen avatar looks list --group-id `) and confirm `preview_image_url` is non-null. If null, poll every 10s up to 5 min. **Do NOT proceed to Discovery until this check passes.** Videos submitted with an unready avatar WILL fail silently. - -- **Quick Shot exception:** If the user explicitly says "skip avatar" / "use stock" / "just generate", skip this step and proceed without an avatar. - ---- - -## Discovery - -Interview the user. Be conversational, skip anything already answered. - -**Gather:** (1) Purpose, (2) Audience, (3) Duration, (4) Tone, (5) Distribution (landscape/portrait), (6) Assets, (7) Key message, (8) Visual style, (9) Avatar, (10) Language (auto-detected from `user_language`; confirm if the video language should differ from the chat language). - -### Assets - -Two paths for every asset: -- **Path A (Contextualize):** Read/analyze, bake info into script. For reference material, auth-walled content. -- **Path B (Attach):** Upload to HeyGen via `heygen asset create --file ` or include as `files[]` entries on video-agent create. For visuals the viewer should see. -- **A+B (Both):** Summarize for script AND attach original. - -**Full routing matrix and upload examples** -> [references/asset-routing.md](references/asset-routing.md) - -**Key rules:** -- HTML URLs cannot go in `files[]` (Video Agent rejects `text/html`). 
Web pages are always Path A. -- Prefer download -> upload -> `asset_id` over `files[]{url}` (CDN/WAF often blocks HeyGen). -- If a URL is inaccessible, tell the user. Never fabricate content from an inaccessible source. -- **Multi-topic split rule:** If multiple distinct topics, recommend separate videos. - -### Style Selection - -Two approaches — use one or combine both: - -**1. API Styles (`style_id`)** — Curated visual templates. Browse by tag, show 3-5 options with previews, let user pick. If a style has a fixed `aspect_ratio`, match orientation to it. When `style_id` is set, the prompt's Visual Style Block becomes optional. - -**2. Prompt Styles** — Full manual control via prompt text. See [references/prompt-styles.md](references/prompt-styles.md). - -### Avatar - -**Full avatar discovery flow, creation APIs, voice selection** -> [references/avatar-discovery.md](references/avatar-discovery.md) - -**Decision flow:** -1. Ask: "Visible presenter or voice-over only?" -2. If voice-over -> no `avatar_id`, state in prompt. -3. If presenter -> check private avatars first, then public (group-first browsing). -4. **Always show preview images.** Never just list names. -5. Confirm voice preferences after avatar is settled. - -**Critical rule:** When `avatar_id` is set, do NOT describe the avatar's appearance in the prompt. Say "the selected presenter." This is the #1 cause of avatar mismatch. - ---- - -## Pipeline: Script -> Prompt Craft -> Frame Check -> Generate -> Deliver - -After Discovery, the producer sub-skill handles the full pipeline. Read `heygen-video/SKILL.md` for detailed stage instructions. - -**Key rules that apply at every stage:** - -- **Language:** Script and narration in the video language (from Discovery item 10). Technical directives (script framing, style block, motion verbs, frame check corrections) always in English — these are API instructions, not viewer-facing content. 
-- **Script:** Structure by type (demo, explainer, tutorial, pitch, announcement). Do NOT assign per-scene durations. Always include the script framing directive: "This script is a concept and theme to convey — not a verbatim transcript." -- **Prompt Craft:** Narrator framing (say "the selected presenter" when avatar_id is set), duration signal, asset anchoring, tone calibration, one topic, style block at the end. -- **Frame Check:** MANDATORY when avatar_id is set. See matrix below. -- **Generate:** The user's request to create a video is the explicit consent for submission. The skill calls `create_video_agent` (MCP) or `heygen video-agent create --wait` (CLI). Run Frame Check before EVERY submission. Capture `session_id` immediately. Poll silently (or let `--wait` block). -- **Deliver:** Report `video_page_url`, session URL, and duration accuracy. Log to `heygen-video-log.jsonl`. - -**Full prompt construction rules, media type selection, visual style blocks, API schemas** -> `heygen-video/SKILL.md` - ---- - -## Frame Check - -**Runs automatically when `avatar_id` is set, before Generate. Appends correction notes to the Video Agent prompt. Does NOT generate images or create new looks.** - -### Steps - -1. **Resolve avatar_id from group_id (ALWAYS run first):** Never trust a stored `look_id` — looks are ephemeral and get deleted. Read `Group ID` from the AVATAR file and resolve a fresh look_id: `list_avatar_looks(group_id=)` (CLI: `heygen avatar looks list --group-id --limit 20`). Pick the look matching the target orientation. Use this resolved look_id as `avatar_id` for all subsequent steps. -2. **Fetch avatar look metadata:** `get_avatar_look(look_id=)` (CLI: `heygen avatar looks get --look-id `) -> extract `avatar_type`, `preview_image_url`, `image_width`, `image_height` -3. **Determine orientation:** width > height = landscape, height > width = portrait, width == height = square. Fetch fails = assume portrait. -4. 
**Determine background:** `photo_avatar` -> Video Agent handles environment. `studio_avatar` -> check if transparent/solid/empty. `video_avatar` -> always has background. -5. **Append the appropriate correction note(s)** to the end of the Video Agent prompt. That's it. No image generation, no new looks. - -### Correction Matrix - -| avatar_type | Orientation Match? | Has Background? | Corrections | -|---|---|---|---| -| `photo_avatar` | matched | (n/a) | None | -| `photo_avatar` | mismatched or square | (n/a) | Framing note | -| `studio_avatar` | matched | Yes | None | -| `studio_avatar` | matched | No | Background note | -| `studio_avatar` | mismatched or square | Yes | Framing note | -| `studio_avatar` | mismatched or square | No | Framing note + Background note | -| `video_avatar` | matched | Yes | None | -| `video_avatar` | mismatched or square | Yes | Framing note | - -### Framing Note (append to prompt) - -For portrait/square avatar -> landscape video: -``` -FRAMING NOTE: The selected avatar image is in {source} orientation but this video is landscape (16:9). Frame the presenter from the chest up, centered in the landscape canvas. Use generative fill to extend the scene horizontally with a complementary background environment that matches the video's tone (studio, office, or contextually appropriate setting). Do NOT add black bars or pillarboxing. The avatar should feel natural in the 16:9 frame. -``` - -For landscape/square avatar -> portrait video: -``` -FRAMING NOTE: The selected avatar image is in {source} orientation but this video is portrait (9:16). Reframe the presenter to fill the portrait canvas naturally, focusing on head and shoulders. Use generative fill to extend vertically if needed. Do NOT add letterboxing. The avatar should fill the portrait frame comfortably. -``` - -### Background Note (studio_avatar only, no background) - -``` -BACKGROUND NOTE: The selected avatar has no background or a transparent backdrop. 
Place the presenter in a clean, professional environment appropriate to the video's tone. For business/tech content: modern studio with soft lighting and subtle depth. For casual content: bright, minimal space with natural light. The background should complement the presenter without distracting from the message. -``` - -**Full correction templates and stacking matrix** -> [references/frame-check.md](references/frame-check.md) - ---- - -## Best Practices - -- **Front-load the hook.** First 5s = 80% of retention. -- **One idea per video.** Single-topic produces dramatically better results. -- **Write for the ear.** If you wouldn't say it to a friend, rewrite it. - -**Known issues** -> [references/troubleshooting.md](references/troubleshooting.md) diff --git a/heygen-avatar/SKILL.md b/heygen-avatar/SKILL.md index 5ccaed3..a453861 100644 --- a/heygen-avatar/SKILL.md +++ b/heygen-avatar/SKILL.md @@ -27,6 +27,39 @@ allowed-tools: Bash, WebFetch, Read, Write, mcp__heygen__* Create and manage HeyGen avatars for anyone: the agent, the user, or named characters. Handles identity extraction, avatar generation, voice selection, and saves everything to `AVATAR-.md` for consistent reuse. +## Files & Paths + +This skill reads and writes the following. No other files are accessed without explicit user instruction. 
+ +| Operation | Path | Purpose | +|-----------|------|---------| +| Read | `SOUL.md`, `IDENTITY.md` | Extract identity details when creating an avatar for the agent | +| Read | `AVATAR-<NAME>.md` | Load existing avatar identity (for variant looks, voice updates) | +| Write | `AVATAR-<NAME>.md` | Save new avatar identity after creation | +| Write | `AVATAR-AGENT.md`, `AVATAR-USER.md` (symlinks) | Role aliases, see Phase 5 | +| Temp write | `/tmp/openclaw/uploads/` | Voice preview audio (downloaded for user playback, deleted after session) | +| Remote upload | HeyGen (via `heygen asset create` or MCP) | User-provided photos uploaded to HeyGen for digital-twin creation | + +Assets are only uploaded to HeyGen when the user explicitly provides them. + +## Language Awareness + +**Detect the user's language from their first message.** Store as `user_language` (e.g., `en`, `ja`, `es`, `ko`, `zh`, `fr`, `de`, `pt`). + +1. **Communicate with the user in their language.** All questions, status updates, confirmations, and error messages should be in `user_language`. +2. **Voice design prompts and selection respect `user_language`.** When designing or selecting a voice, specify the target language so the voice library returns matches that speak it. +3. **Technical directives stay in English** — enum values (`Young Adult`, `Realistic`, `landscape`, etc.) are API-level and not translated. + +## UX Rules + +1. **Be concise.** No avatar IDs, group IDs, or raw API payloads in chat. Report the result (avatar created, ready to use) not the plumbing. +2. **No internal jargon.** Never mention internal phase names ("Phase 0", "Phase 5 Symlink Maintenance") to the user. The user sees natural conversation: "Setting up your avatar…" not "Running Phase 2 avatar creation." +3. **One or two questions per phase.** Don't batch-ask. Walk phases in order, ask the smallest set of questions needed to proceed. +4. 
**Read workspace files before asking.** `SOUL.md`, `IDENTITY.md`, `AVATAR-*.md` at the workspace root contain identity. Check them first. Only ask the user for what's genuinely missing. +5. **Don't narrate skill internals.** Never say "let me read the workflow," "checking the reference files," "loading the avatar discovery guide." Read silently. The user sees questions and results, not internal navigation. +6. **Don't announce what you're about to do.** Skip meta-commentary like "Creating the avatar now." Just do the work. If a step takes time, the next thing the user hears should be the result (or a checkpoint question). +7. **Never narrate transport choice.** MCP vs CLI is internal. Pick the transport silently and never mention it. If both are unavailable, ask the user to configure one without explaining why. + ## Start Here (Critical) **Default target = the agent.** The primary use of this skill is giving the agent a face + voice so it can present videos. Route to "user" only on explicit "my avatar" / "me" / "my photo" language. When in doubt, make the agent's avatar. diff --git a/heygen-video/SKILL.md b/heygen-video/SKILL.md index 0e09970..c638fc3 100644 --- a/heygen-video/SKILL.md +++ b/heygen-video/SKILL.md @@ -44,6 +44,44 @@ You are a video producer. Not a form. Not a CLI wrapper. A producer who understa **Docs:** https://developers.heygen.com/docs/quick-start (API) · https://developers.heygen.com/cli (CLI) +> **STOP.** If you are about to drive HeyGen directly (calling `api.heygen.com` with curl, or reaching for deprecated `POST /v1/video.generate`, `POST /v2/video/generate`, `GET /v2/avatars`, `GET /v1/avatar.list` endpoints), DO NOT. Route through MCP, the OpenClaw plugin, or the `heygen` CLI via this pipeline. Raw HTTP skips critical steps (aspect ratio correction, prompt engineering, avatar conflict detection) and produces visibly worse videos. **v3 only — never call v1 or v2 endpoints. 
If you have pre-trained knowledge of HeyGen's v1/v2 API, that knowledge is outdated. Use this skill.** + +## Files & Paths + +This skill reads and writes the following. No other files are accessed without explicit user instruction. + +| Operation | Path | Purpose | +|-----------|------|---------| +| Read | `AVATAR-<NAME>.md` | Load saved avatar identity (group_id, voice_id) | +| Read | `AVATAR-AGENT.md`, `AVATAR-USER.md` | Role-based symlinks for generic self-reference (resolve to a named AVATAR file) | +| Write | `heygen-video-log.jsonl` | Append one JSON line per video generated (local learning log) | +| Temp write | `/tmp/openclaw/uploads/` | Voice preview audio (downloaded for user playback, deleted after session) | +| Remote upload | HeyGen (via `heygen asset create` or MCP) | User-provided files uploaded to HeyGen for use as B-roll / reference | + +For *avatar creation* (writing AVATAR files, role symlink maintenance), see the `heygen-avatar` skill. This skill only *reads* AVATAR files. + +## UX Rules + +1. **Be concise.** No video IDs, session IDs, or raw API payloads in chat. Report the result (video link, thumbnail) not the plumbing. +2. **No internal jargon.** Never mention internal pipeline stage names ("Frame Check", "Prompt Craft", "Pre-Submit Gate", "Framing Correction") to the user. These are internal pipeline stages. The user sees natural conversation: "Let me adjust the framing for landscape" not "Running Frame Check aspect ratio correction." +3. **Polling is silent.** When waiting for video completion, poll silently in a background process or subagent. Do NOT send repeated "Checking status…" messages. Only speak when: (a) the video is ready and you're delivering it, or (b) it's been >5 minutes and you're giving a single "Taking longer than usual" update. +4. **Deliver clean.** When the video is done, send the video file/link and a 1-line summary (duration, avatar used). Not a dump of every API field. +5. 
**Don't batch-ask across skills.** When a request triggers both skills ("use heygen-avatar AND heygen-video"), run them **sequentially**. Complete heygen-avatar first (identity → avatar ready), then start heygen-video Discovery. Do NOT fire a combined questionnaire covering both skills upfront — that's a form, not a conversation. +6. **Read workspace files before asking.** `AVATAR-<NAME>.md` files at the workspace root contain existing avatar state. Check them first. Only ask the user for what's genuinely missing. +7. **Don't narrate skill internals.** Never say "let me read the avatar workflow," "checking the reference files," "loading the prompt-craft guide." Read silently. The user sees the outcome (a question, a result, a video). +8. **Don't announce what you're about to do.** Skip meta-commentary like "Creating the video now," "Let me call the API." Just do the work. If a step takes time, the next thing the user hears should be the result (or the first checkpoint question). If you must say something, keep it to <10 words. +9. **Never narrate transport choice.** MCP vs CLI vs OpenClaw plugin is an internal implementation detail. Do NOT say "CLI is broken," "switching to MCP," etc. Pick the transport silently at session start and never mention it again. + +## Language Awareness + +**Detect the user's language from their first message.** Store as `user_language` (e.g., `en`, `ja`, `es`, `ko`, `zh`, `fr`, `de`, `pt`). + +1. **Communicate with the user in their language.** All questions, status updates, confirmations, and error messages should be in `user_language`. +2. **Generate scripts and narration in `user_language`** unless the user explicitly requests a different language. +3. **Technical directives stay in English.** Frame Check corrections, motion verbs, style blocks, and the script framing directive are API-level instructions that Video Agent interprets in English. Never translate these. +4. 
**Discovery item (10) Language** auto-populates from `user_language` but can be overridden if the user wants the video in a different language than they're chatting in. +5. **Voice selection must match the video language.** Filter voices by `language` parameter and set `voice_settings.locale` on API calls. + ## API Mode Detection **Pick one transport at session start. Never mix, never switch mid-session, never narrate the choice.** @@ -117,6 +155,19 @@ Default to Full Producer. Better to ask one smart question than generate a medio --- +## First Look — First-Run Avatar Check + +**Runs once before Discovery on the first video request in a session.** + +Check for any `AVATAR-*.md` files in the workspace root. The directory may also contain role-based **symlinks** (`AVATAR-AGENT.md`, `AVATAR-USER.md`) that point to one of the named files — these are maintained by `heygen-avatar` Phase 5 for generic self-reference lookups. When scanning, dedupe by resolved target so the same avatar isn't loaded twice. + +- **Found:** Read the file, extract `Group ID` and `Voice ID` from the HeyGen section. Pre-load as defaults for Discovery. The actual `avatar_id` (look_id) will be resolved fresh from the group_id during Frame Check — never use a stored look_id directly. +- **Not found:** The user (or agent) has no avatar yet. Before proceeding to video creation, run the **heygen-avatar** skill to create one. Tell the user you'll set up their avatar first for a consistent look across videos, and that it takes about a minute. Communicate in `user_language`. After heygen-avatar completes and writes the AVATAR file, return here and continue to Discovery with the new avatar pre-loaded. +- **Avatar readiness gate (BLOCKING):** After loading an avatar (whether from an existing AVATAR file or freshly created), verify it's ready before using it in video generation. Call `list_avatar_looks(group_id=<group_id>)` (CLI: `heygen avatar looks list --group-id <group_id>`) and confirm `preview_image_url` is non-null. 
If null, poll every 10s up to 5 min. **Do NOT proceed to Discovery until this check passes.** Videos submitted with an unready avatar WILL fail silently. +- **Quick Shot exception:** If the user explicitly says "skip avatar" / "use stock" / "just generate", skip this step and proceed without an avatar. + +--- + ## Discovery Interview the user. Be conversational, skip anything already answered. diff --git a/references/asset-routing.md b/references/asset-routing.md deleted file mode 100644 index b7eb708..0000000 --- a/references/asset-routing.md +++ /dev/null @@ -1,86 +0,0 @@ -# Asset Handling — The Classification Engine - -When the user provides files, URLs, or references, route each asset to the right path. The user should NEVER have to think about this. - -## Two Paths - -| Path | What happens | When to use | -|------|-------------|-------------| -| **A: Contextualize → Prompt** | Read/analyze the asset, extract key info, bake into script. Video Agent never sees the original. | Reference material, auth-walled content, documents where the *information* matters more than the *visual*. | -| **B: Attach to API** | Upload the raw file via `files[]`. Video Agent analyzes, extracts graphics, uses as frames/B-roll. | Screenshots, branded assets, PDFs with important visual layouts, images the viewer should literally see. | -| **A+B: Both** | Contextualize for script quality AND attach for visual use. | Long docs where you need to summarize but Video Agent should also have the full source. | - -## Classification Flow - -``` -1. Can Video Agent access this directly? - - Public URL (no auth, no paywall) → YES - - Private/internal URL → NO - - Local file → NO (must upload first) - -2. Should the viewer SEE this asset? - - Screenshot, logo, product image, chart → YES → Path B - - Research doc, article, context material → NO → Path A - - Ambiguous → Path A+B - -3. Is the content too long for the prompt? 
- - Short (< 500 words) → fits in prompt - - Long (> 500 words) → summarize key points, attach full doc -``` - -## Decision Matrix - -| Asset Type | Publicly Accessible? | Show On Screen? | Route | -|-----------|---------------------|----------------|-------| -| Screenshot / image | N/A | Yes | **B: Attach** + describe in prompt as B-roll | -| Logo / brand asset | N/A | Yes | **B: Attach** + anchor to intro/outro | -| Public URL to file (PDF, image, video) | Yes | Maybe | **B: Download → upload via `/v3/assets` → pass `asset_id`** + summarize | -| Public URL to web page (HTML) | Yes | No | **A: Fetch and contextualize only.** Do NOT pass HTML URLs in `files[]`. | -| Auth-walled URL (requires login) | No | No | **A: Ask the user to paste the content.** Never fabricate. | -| PDF (short, text-heavy) | N/A | No | **A+B: Extract key points** + attach | -| PDF (long, visual-rich) | N/A | Maybe | **B: Attach** + summarize top points | -| Raw data / spreadsheet | N/A | Partially | **A: Analyze and describe** key stats. Attach if charts should appear. | - -## Executing Routes - -### Path A (Contextualize) -- URLs: Use `web_fetch` to retrieve publicly accessible content -- For auth-walled content you cannot access: ask the user to paste the text directly -- Extract 3-5 most important points relevant to the video -- Weave naturally into the script. Don't dump. Integrate. - -### Path B (Attach) -Upload to HeyGen: - -**MCP:** upload via the asset tool (depends on environment). -**CLI:** `heygen asset create --file /path/to/file.png` - -Max 32MB per file. Returns JSON with the new `asset_id`. 
- -Or pass inline in `files[]`: -```json -{"type": "url", "url": "https://example.com/image.png"} -{"type": "asset_id", "asset_id": ""} -{"type": "base64", "data": "", "content_type": "image/png"} -``` - -### Describe Asset Usage in Prompt -Be SPECIFIC: -- "Use the uploaded dashboard screenshot as B-roll when discussing analytics" -- "Display the company logo in the intro and end card" - -### Log Classification -In the learning log entry, record: -```json -"assets_classified": [{"type": "image", "route": "attach", "accessible": true, "reason": "product screenshot"}] -``` - -## Rules - -- **Never ask the user which path unless genuinely 50/50.** You're the producer. Make the call. -- **When in doubt, do both (A+B).** Over-providing costs nothing. -- **Always describe attached assets in the prompt.** Uploading without description = ignored. -- **Auth-walled content is YOUR job.** Bridge the gap between your access and Video Agent's. -- **URLs that fail:** Try `web_fetch`. If login/paywall/404 → tell the user, ask for content directly. Never silently fabricate. -- **HTML URLs cannot go in `files[]`.** Video Agent rejects `text/html`. Web pages are ALWAYS Path A only. -- **Prefer download→upload→asset_id** over `files[]{url}`. HeyGen's servers often blocked by CDN/WAF. diff --git a/references/avatar-discovery.md b/references/avatar-discovery.md deleted file mode 100644 index fc3a5e4..0000000 --- a/references/avatar-discovery.md +++ /dev/null @@ -1,213 +0,0 @@ -# Avatar Discovery & Voice Selection - -## Path 0: Resolve workspace AVATAR files first - -Before any HeyGen catalog lookup, check the workspace root for an -applicable `AVATAR-*.md` file. These are written by `heygen-avatar` -and contain `Group ID` + `Voice ID` ready to use, with no API call -needed. 
- -Resolution precedence: - -| Request signal | File to read | -|---|---| -| Named subject ("video with Eve", "Cleo's update") | `AVATAR-.md` | -| Agent self-reference ("video of yourself", "give us your update") | `AVATAR-AGENT.md` (symlink) | -| User self-reference ("video of me", "my video update") | `AVATAR-USER.md` (symlink) | -| No subject in request | Skip to Path A | - -`AVATAR-AGENT.md` and `AVATAR-USER.md` are role-based symlinks maintained -by `heygen-avatar` Phase 5; they resolve to the current agent's / user's -named AVATAR file at read time. Treat them like any other AVATAR file -once read. - -If the resolved file has a populated HeyGen section, extract `Group ID` -and `Voice ID` and proceed to Frame Check. Skip Path A entirely. If the -file exists but the HeyGen section is empty, run `heygen-avatar` Phase 2 -first. - -If no file applies (no name match, no role alias, generic catalog -browsing requested) — fall through to Path A below. - -## Path A: Discover Existing Avatars - -### A1: Check for private avatars first - -**If user specifies an avatar by name** (e.g. "use Eve's Podcast look"), take the fast path: - -**MCP:** `list_avatar_looks(ownership=private)` — filter client-side by name match. -**CLI:** -```bash -heygen avatar looks list --ownership private --limit 50 -``` -Avoids the 2-call group→looks pattern. - -**If user wants to browse**, use the group-first flow: - -**MCP:** -1. `list_avatar_groups(ownership=private)` — list groups (each group = one person) -2. `list_avatar_looks(group_id=)` — show looks for chosen group - -**CLI:** -```bash -heygen avatar list --ownership private --limit 50 -heygen avatar looks list --group-id --limit 50 -``` - -Each look has an `id` — this is the `avatar_id` you pass downstream. - -Avatar types: `studio_avatar`, `video_avatar`, `photo_avatar`. Photo avatars support `motion_prompt` and `expressiveness`. - -**ALWAYS show the preview image** when presenting an avatar look. 
Each look response includes `preview_image_url` — display inline. - -### A2: Check last-used avatar - -Check `heygen-video-log.jsonl` for last used avatar_id. If found: - -**MCP:** `get_avatar_look(look_id=)` -**CLI:** `heygen avatar looks get --look-id ` - -Show preview image: "Last time you used [Avatar Name]. Use her again?" - -### A3: Avatar conversation - -Ask: "Do you want a visible presenter, or voice-over only?" - -If voice-over only → no `avatar_id`. State in prompt: "Voice-over narration only." - -If presenter wanted, present private avatars first. For public/stock avatars, browse by group: - -**MCP:** `list_avatar_groups(ownership=public)` -**CLI:** -```bash -heygen avatar list --ownership public --limit 20 -``` - -Show group names + one representative image. Let the user pick a person. - -**MCP:** `list_avatar_looks(group_id=)` -**CLI:** -```bash -heygen avatar looks list --group-id --limit 10 -``` - -**Why group-first:** The flat `heygen avatar looks list --ownership public` call returns 50+ results for only 3 unique people per page. Group-level browsing (2 calls) gives much better discovery UX. - -### A4: Voice direction - -After avatar is settled, confirm voice preferences (accent, delivery style, language). - -**ALWAYS show a playable voice preview.** Each voice response includes `preview_audio_url` — share it. - -**Handling missing/broken previews:** Some voices return bare `s3://` paths or `null`. When this happens: note "(no preview available)" and offer to generate a short TTS sample via `create_speech` (MCP) or `heygen voice speech create --text "" --voice-id --input-type plain_text --language en --locale en-US` (CLI). - ---- - -## Path B: Create a New Avatar - -Two modes: - -**Mode 1 — New character** (omit `avatar_group_id`): Creates a new person with their own group. -**Mode 2 — New look** (include `avatar_group_id`): Adds a variation to an existing character. 
- -Always use Mode 2 when the avatar already exists and you're creating a variant (different outfit, orientation fix, bg change). Only use Mode 1 for genuinely new characters. - -Three creation types: - -**Photo avatar (from user's photo):** - -**MCP:** `create_photo_avatar(name=, file=, avatar_group_id=)` -**CLI:** -```bash -heygen avatar create -d '{ - "type": "photo", - "name": "My Avatar", - "file": {"type": "url", "url": "https://example.com/headshot.jpg"}, - "avatar_group_id": "" -}' -``` -Photo requirements: JPEG or PNG, min 512x512, clear front-facing face, good lighting. - -**AI-generated avatar (from text prompt):** - -**MCP:** `create_prompt_avatar(name=, prompt=, avatar_group_id=)` -**CLI:** -```bash -heygen avatar create -d '{ - "type": "prompt", - "name": "Tech Presenter", - "prompt": "Young professional woman, modern workspace, confident smile", - "avatar_group_id": "" -}' -``` -Prompt max: 1000 characters. Optional: up to 3 `reference_images`. - -**Video avatar (from user's video recording):** - -**MCP:** `create_digital_twin(name=, file=, avatar_group_id=)` -**CLI:** -```bash -heygen avatar create -d '{ - "type": "video", - "name": "My Video Avatar", - "file": {"type": "asset_id", "asset_id": ""}, - "avatar_group_id": "" -}' -``` - -All three return `avatar_item` with `id` (look_id) and `group_id` — use `id` as `avatar_id` for videos. - -Files: `{"type": "url", "url": "..."}`, `{"type": "asset_id", "asset_id": "..."}` (from `heygen asset create --file `), or `{"type": "base64", "data": "...", "content_type": "..."}`. - ---- - -## Path C: Direct Image (Simplest for One-Off) - -Skip avatar creation. Pass `image_url` directly: - -**MCP:** `create_video_from_image(image_url=, script=