diff --git a/.claude/launch.json b/.claude/launch.json new file mode 100644 index 0000000..6e056f3 --- /dev/null +++ b/.claude/launch.json @@ -0,0 +1,11 @@ +{ + "version": "0.0.1", + "configurations": [ + { + "name": "bioclaw-dev", + "runtimeExecutable": "npm", + "runtimeArgs": ["run", "dev"], + "port": 3847 + } + ] +} diff --git a/.env.example b/.env.example index 43964cd..d925fd2 100644 --- a/.env.example +++ b/.env.example @@ -41,6 +41,13 @@ WECOM_SECRET=your-secret # WECOM_AGENT_ID=your-agent-id # WECOM_CORP_SECRET=your-corp-secret +# ─── Feishu / Lark (optional) ───────────────── +# Create a self-built app at https://open.feishu.cn/ +# Enable: Bot capability, im:message, im:message:send_as_bot, im:resource +# Under Events & Callbacks → choose "Long connection" (长连接) +FEISHU_APP_ID=cli_your_app_id +FEISHU_APP_SECRET=your_app_secret + # ─── Discord — Optional ────────────────────── # To enable Discord bot: diff --git a/.gitignore b/.gitignore index 5f6a0fe..8b234d6 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ groups/global/* *.keys.json .env +# Local startup scripts (contain machine-specific paths) +start.sh + # OS .DS_Store diff --git a/README.md b/README.md index f2c2513..5a69245 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,46 @@ WECOM_CORP_SECRET=your-corp-secret ``` The server IP must be added to the app's trusted IP whitelist. +### Feishu / Lark (飞书) + +1. Go to the [Feishu Open Platform](https://open.feishu.cn/) and create a **self-built app** (企业自建应用) +2. Enable **Bot** capability under **Add Capabilities** +3. Under **Permissions & Scopes**, grant: + - `im:message` — Receive messages + - `im:message:send_as_bot` — Send messages as bot + - `im:resource` — Download images/files from messages + - `im:message.group_msg` — Receive group messages (if using in groups) +4. Under **Events & Callbacks**, select **Long Connection** (长连接) mode +5. Subscribe to event: `im.message.receive_v1` +6. 
Copy the **App ID** and **App Secret**, and add them to `.env`: + ``` + FEISHU_APP_ID=cli_your_app_id + FEISHU_APP_SECRET=your_app_secret + ``` +7. Publish the app version and have the admin approve it +8. Add the bot to a group or send it a direct message to start chatting + +**Auto-registration:** New chats are automatically registered — no manual setup needed. By default, they use the `main` group folder. Override with: +``` +FEISHU_DEFAULT_FOLDER=my-folder +``` + +**Multi-bot support:** Up to 3 Feishu bots can run simultaneously (e.g., different agents for different groups): +``` +FEISHU2_APP_ID=cli_second_app_id +FEISHU2_APP_SECRET=second_app_secret +FEISHU2_DEFAULT_FOLDER=literature + +FEISHU3_APP_ID=cli_third_app_id +FEISHU3_APP_SECRET=third_app_secret +FEISHU3_DEFAULT_FOLDER=qwen-agent +``` + +

+ Feishu Chat Example 1 + Feishu Chat Example 2 +

+ ### Discord 1. Go to the [Discord Developer Portal](https://discord.com/developers/applications) diff --git a/README.zh-CN.md b/README.zh-CN.md index d24f464..39ae277 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -41,7 +41,7 @@ BioClaw 将常见的生物信息学任务带到聊天界面中。研究者可以 ## 快速开始 -> 说明:当前仓库中已经实现的消息通道是 WhatsApp。文档中的 QQ / 飞书截图展示的是扩展方向,不代表仓库里已经内置了可直接运行的 QQ / 飞书通道。 +> 说明:当前仓库中已实现的消息通道包括 WhatsApp、WeCom(企业微信)、飞书(Lark)和 Discord。QQ 通道展示的是扩展方向,尚未内置。 > 现在也支持一个更适合 Windows 用户的本地网页聊天入口。若你在中国、或者暂时不想接 WhatsApp,可直接走 `HTTP webhook + 本地网页聊天`。 @@ -147,6 +147,41 @@ WECOM_CORP_SECRET=应用Secret ``` 服务器 IP 需加入应用的企业可信 IP 白名单。 +### 飞书(Lark) + +1. 前往 [飞书开放平台](https://open.feishu.cn/) 创建 **企业自建应用** +2. 在 **添加应用能力** 中启用 **机器人** +3. 在 **权限管理** 中开通以下权限: + - `im:message` — 接收消息 + - `im:message:send_as_bot` — 以机器人身份发送消息 + - `im:resource` — 下载消息中的图片和文件 + - `im:message.group_msg` — 接收群聊消息 +4. 在 **事件与回调** 中选择 **长连接** 模式 +5. 订阅事件:`im.message.receive_v1` +6. 复制 **App ID** 和 **App Secret**,添加到 `.env`: + ``` + FEISHU_APP_ID=cli_your_app_id + FEISHU_APP_SECRET=your_app_secret + ``` +7. 发布应用版本并通过管理员审批 +8. 将机器人添加到群聊或直接发送私聊消息即可开始对话 + +**自动注册:** 新对话会自动注册,无需手动配置。默认使用 `main` 文件夹,可通过以下配置覆盖: +``` +FEISHU_DEFAULT_FOLDER=my-folder +``` + +**多机器人支持:** 最多可同时运行 3 个飞书机器人(例如不同群使用不同 agent): +``` +FEISHU2_APP_ID=cli_second_app_id +FEISHU2_APP_SECRET=second_app_secret +FEISHU2_DEFAULT_FOLDER=literature + +FEISHU3_APP_ID=cli_third_app_id +FEISHU3_APP_SECRET=third_app_secret +FEISHU3_DEFAULT_FOLDER=qwen-agent +``` + ### Discord 1. 
/**
 * One incremental event streamed to the host while a query is running.
 *
 * Events are serialized as JSON and framed by the event start/end marker
 * lines. Which optional fields are populated depends on `type`:
 *
 * - `tool_call`   — `id`, `tool` and `input` describe the tool invocation.
 * - `tool_result` — `id` is the originating tool-use id and `output` is the
 *                   (possibly truncated) textual result.
 * - `text`        — `text` carries an assistant text block.
 */
interface ContainerEvent {
  /** Discriminator telling the consumer which optional fields to read. */
  type: 'tool_call' | 'tool_result' | 'text';
  /** Tool-use id; links a `tool_result` back to its `tool_call`. */
  id?: string;
  /** Tool name (only for `tool_call`). */
  tool?: string;
  /** Tool arguments as an opaque JSON object (only for `tool_call`). */
  input?: Record<string, unknown>;
  /** Tool output text (only for `tool_result`). */
  output?: string;
  /** Assistant text (only for `text`). */
  text?: string;
}
/**
 * Writes one event to stdout as three lines: the event start marker, the
 * JSON-serialized event, and the event end marker.
 *
 * @param event - Event to emit; it must be JSON-serializable.
 * @throws Whatever `JSON.stringify` throws (e.g. on circular structures).
 *         In that case nothing is written.
 */
function writeEvent(event: ContainerEvent): void {
  // Serialize before printing anything: if serialization fails we must not
  // leave a start marker on stdout without its matching end marker.
  const payload = JSON.stringify(event);
  console.log(EVENT_START_MARKER);
  console.log(payload);
  console.log(EVENT_END_MARKER);
}
` text=${textResult.slice(0, 200)}` : ''}`); + const resultMsg = message as any; + // Build per-result usage from this result message + const resultUsage: TokenUsageSummary = { + input_tokens: resultMsg.usage?.input_tokens || 0, + output_tokens: resultMsg.usage?.output_tokens || 0, + cache_read_tokens: resultMsg.usage?.cache_read_input_tokens || 0, + cache_creation_tokens: resultMsg.usage?.cache_creation_input_tokens || 0, + cost_usd: resultMsg.total_cost_usd || 0, + duration_ms: resultMsg.duration_ms || 0, + num_turns: resultMsg.num_turns || 0, + }; + log(`Result #${resultCount}: subtype=${message.subtype} tokens=${resultUsage.input_tokens}/${resultUsage.output_tokens} cost=$${resultUsage.cost_usd.toFixed(4)}${textResult ? ` text=${textResult.slice(0, 200)}` : ''}`); writeOutput({ status: 'success', result: textResult || null, - newSessionId + newSessionId, + usage: (resultUsage.input_tokens > 0 || resultUsage.output_tokens > 0) ? resultUsage : undefined, }); } } @@ -1060,6 +1132,7 @@ async function main(): Promise { }); process.exit(1); } + } main(); diff --git a/container/skills.dev-backup/agent-browser/SKILL.md b/container/skills.dev-backup/agent-browser/SKILL.md new file mode 100644 index 0000000..dd6c6bf --- /dev/null +++ b/container/skills.dev-backup/agent-browser/SKILL.md @@ -0,0 +1,159 @@ +--- +name: agent-browser +description: Browse the web for any task — research topics, read articles, interact with web apps, fill forms, take screenshots, extract data, and test web pages. Use whenever a browser would be useful, not just when the user explicitly asks. +allowed-tools: Bash(agent-browser:*) +--- + +# Browser Automation with agent-browser + +## Quick start + +```bash +agent-browser open # Navigate to page +agent-browser snapshot -i # Get interactive elements with refs +agent-browser click @e1 # Click element by ref +agent-browser fill @e2 "text" # Fill input by ref +agent-browser close # Close browser +``` + +## Core workflow + +1. 
Navigate: `agent-browser open ` +2. Snapshot: `agent-browser snapshot -i` (returns elements with refs like `@e1`, `@e2`) +3. Interact using refs from the snapshot +4. Re-snapshot after navigation or significant DOM changes + +## Commands + +### Navigation + +```bash +agent-browser open # Navigate to URL +agent-browser back # Go back +agent-browser forward # Go forward +agent-browser reload # Reload page +agent-browser close # Close browser +``` + +### Snapshot (page analysis) + +```bash +agent-browser snapshot # Full accessibility tree +agent-browser snapshot -i # Interactive elements only (recommended) +agent-browser snapshot -c # Compact output +agent-browser snapshot -d 3 # Limit depth to 3 +agent-browser snapshot -s "#main" # Scope to CSS selector +``` + +### Interactions (use @refs from snapshot) + +```bash +agent-browser click @e1 # Click +agent-browser dblclick @e1 # Double-click +agent-browser fill @e2 "text" # Clear and type +agent-browser type @e2 "text" # Type without clearing +agent-browser press Enter # Press key +agent-browser hover @e1 # Hover +agent-browser check @e1 # Check checkbox +agent-browser uncheck @e1 # Uncheck checkbox +agent-browser select @e1 "value" # Select dropdown option +agent-browser scroll down 500 # Scroll page +agent-browser upload @e1 file.pdf # Upload files +``` + +### Get information + +```bash +agent-browser get text @e1 # Get element text +agent-browser get html @e1 # Get innerHTML +agent-browser get value @e1 # Get input value +agent-browser get attr @e1 href # Get attribute +agent-browser get title # Get page title +agent-browser get url # Get current URL +agent-browser get count ".item" # Count matching elements +``` + +### Screenshots & PDF + +```bash +agent-browser screenshot # Save to temp directory +agent-browser screenshot path.png # Save to specific path +agent-browser screenshot --full # Full page +agent-browser pdf output.pdf # Save as PDF +``` + +### Wait + +```bash +agent-browser wait @e1 # Wait for element 
+agent-browser wait 2000 # Wait milliseconds +agent-browser wait --text "Success" # Wait for text +agent-browser wait --url "**/dashboard" # Wait for URL pattern +agent-browser wait --load networkidle # Wait for network idle +``` + +### Semantic locators (alternative to refs) + +```bash +agent-browser find role button click --name "Submit" +agent-browser find text "Sign In" click +agent-browser find label "Email" fill "user@test.com" +agent-browser find placeholder "Search" type "query" +``` + +### Authentication with saved state + +```bash +# Login once +agent-browser open https://app.example.com/login +agent-browser snapshot -i +agent-browser fill @e1 "username" +agent-browser fill @e2 "password" +agent-browser click @e3 +agent-browser wait --url "**/dashboard" +agent-browser state save auth.json + +# Later: load saved state +agent-browser state load auth.json +agent-browser open https://app.example.com/dashboard +``` + +### Cookies & Storage + +```bash +agent-browser cookies # Get all cookies +agent-browser cookies set name value # Set cookie +agent-browser cookies clear # Clear cookies +agent-browser storage local # Get localStorage +agent-browser storage local set k v # Set value +``` + +### JavaScript + +```bash +agent-browser eval "document.title" # Run JavaScript +``` + +## Example: Form submission + +```bash +agent-browser open https://example.com/form +agent-browser snapshot -i +# Output shows: textbox "Email" [ref=e1], textbox "Password" [ref=e2], button "Submit" [ref=e3] + +agent-browser fill @e1 "user@example.com" +agent-browser fill @e2 "password123" +agent-browser click @e3 +agent-browser wait --load networkidle +agent-browser snapshot -i # Check result +``` + +## Example: Data extraction + +```bash +agent-browser open https://example.com/products +agent-browser snapshot -i +agent-browser get text @e1 # Get product title +agent-browser get attr @e2 href # Get link URL +agent-browser screenshot products.png +``` diff --git 
a/container/skills.dev-backup/bio-research-pipeline/SKILL.md b/container/skills.dev-backup/bio-research-pipeline/SKILL.md new file mode 100644 index 0000000..5ca8eb8 --- /dev/null +++ b/container/skills.dev-backup/bio-research-pipeline/SKILL.md @@ -0,0 +1,496 @@ +--- +name: bio-research-pipeline +description: > + 生物研究假说生成流程 / Biological hypothesis generation pipeline. + 用户提出研究方向后,自动并行搜索文献(PubMed + 预印本 + 通路数据库), + 生成≥5条机制假说,多角色辩论筛出Top3,并设计湿实验方案。 + Trigger when user asks: 帮我研究、提假说、设计实验方向、research direction, + generate hypotheses, literature review + experimental design. +keywords: + - bio-research-pipeline + - 研究假说 + - 假说生成 + - 文献综述 + - 实验设计 + - 湿实验 + - hypothesis-generation + - literature-review + - experimental-design + - wet-lab + - research-direction + - pathway-analysis + - 研究方向 + - 通路分析 +--- + +# Biological Research Hypothesis Pipeline + +> **自动触发条件(必须主动调用此 skill):** +> 当用户说以下任何内容时,**立即运行此 skill,不要直接回答**: +> - "帮我研究一下 X"、"研究一下 X 机制"、"X 方向帮我提假说" +> - "针对 X 设计研究方向/实验方案" +> - "X 通路的假说"、"X 相关的研究思路" +> - "run bio-research-pipeline"、"generate hypotheses for X" +> +> **执行本 skill 时,严格按照下面 Stage 1→5 的顺序完整跑完,不要跳过任何阶段。** + +You are a research coordinator orchestrating a multi-stage biological research pipeline. +This skill is triggered when a user provides a broad biological research direction and wants: +- A thorough literature review +- Multiple mechanistic hypotheses +- Multi-perspective critique and ranking +- Wet-lab experimental designs + +--- + +## SCRIPT PATH RESOLUTION + +This skill includes three helper scripts. 
Before running them, locate them: + +```bash +SKILL_DIR=$(find ~/.claude/skills -name "SKILL.md" -path "*/bio-research-pipeline/*" | xargs dirname 2>/dev/null | head -1) +PUBMED_SCRIPT="$SKILL_DIR/scripts/pubmed-fetch" +PREPRINT_SCRIPT="$SKILL_DIR/scripts/preprint-fetch" +PATHWAY_SCRIPT="$SKILL_DIR/scripts/pathway-search" +echo "Skill dir: $SKILL_DIR" +ls "$SKILL_DIR/scripts/" +``` + +Then invoke scripts as: +```bash +python3 "$PUBMED_SCRIPT" "your topic" --max 40 --years 5 +python3 "$PREPRINT_SCRIPT" "your topic" --max 30 --days 180 +python3 "$PATHWAY_SCRIPT" "your topic" --gene GENE_SYMBOL +``` + +--- + +## HOW TO INVOKE THIS PIPELINE + +When a user asks something like: +- "帮我研究一下 [方向]" +- "我想研究 [X] 机制,帮我提假说" +- "针对 [疾病/通路/基因],设计一个研究方向" +- "run bio-research-pipeline on [topic]" + +Parse the research direction from the user's message, then execute all 5 stages below **in order**. + +--- + +## STAGE 1 — PARALLEL LITERATURE SEARCH (3 MODELS) + +**Goal:** Cast a wide net using three models simultaneously — each with a different strength. + +| Role | Model | Responsibility | +|------|-------|---------------| +| Task A | **Claude** (you) | Real data fetch — PubMed API + bioRxiv API, runs Python scripts | +| Task B | **MiniMax** | Biomedical knowledge analysis, Chinese literature context, cross-checking | +| Task C | **Qwen** | Pathway landscape mapping, fast synthesis, regulatory network overview | + +Use the `Task` tool to launch **all 3 tasks in the same message** (parallel). Do NOT wait for one before starting the next. + +--- + +### Task A — Claude: Real Literature Fetch (PubMed + Preprints) + +This task runs actual API calls using the helper scripts in this skill. + +Prompt for Task A: +``` +You are doing a real literature fetch for the bio-research-pipeline. 
+Research direction: {RESEARCH_DIRECTION} + +Step 1 — Locate skill scripts: +SKILL_DIR=$(find ~/.claude/skills -name "SKILL.md" -path "*/bio-research-pipeline/*" | xargs dirname 2>/dev/null | head -1) + +Step 2 — Run PubMed fetch: +python3 "$SKILL_DIR/scripts/pubmed-fetch" "{RESEARCH_DIRECTION}" --max 40 --years 5 + +Step 3 — Run preprint fetch: +python3 "$SKILL_DIR/scripts/preprint-fetch" "{RESEARCH_DIRECTION}" --max 25 --days 180 + +Step 4 — Synthesize into structured summary: +- Key molecular mechanisms found +- Key proteins/genes/pathways mentioned +- Most significant recent findings (2023-2025) +- Any contradictions or open debates +- Top 5 most relevant papers with PMID/DOI + +Output all results clearly labeled with section headers. +``` + +--- + +### Task B — MiniMax: Biomedical Knowledge Analysis + +This task calls MiniMax via `mcp__bioclaw__call_minimax`. MiniMax contributes its own training knowledge — particularly strong on Chinese biomedical literature, clinical context, and TCM-related pathways. + +Prompt for Task B: +``` +Call mcp__bioclaw__call_minimax with this prompt: + +system: "You are an expert biomedical research analyst with deep knowledge of molecular biology, disease mechanisms, and the latest research trends in both Western and Chinese scientific literature." + +prompt: "Research direction: {RESEARCH_DIRECTION} + +Please provide a comprehensive analysis covering: + +1. CURRENT STATE OF THE FIELD + - What is well-established about this topic? + - What are the 3-5 most important mechanistic insights from recent years? + - Which research groups/labs are leading this field? + +2. KEY MOLECULAR PLAYERS + - List the most important proteins, genes, and non-coding RNAs involved + - Describe their known roles and interactions + - Note any recently discovered players (2022-2025) + +3. DISEASE RELEVANCE + - Which diseases/conditions is this most relevant to? + - What is the current clinical/translational status? 
+ - Any recent clinical trials or translational breakthroughs? + +4. KNOWLEDGE GAPS + - What are the most important unresolved questions? + - Where do different research groups disagree? + - What has been tried but failed, and why? + +5. EMERGING ANGLES + - What novel angles are researchers starting to explore? + - Any recent paradigm shifts in thinking about this topic? + +Provide specific, concrete information. Cite field knowledge accurately." + +After getting MiniMax's response, output it verbatim with the header: "=== MINIMAX ANALYSIS ===" +``` + +--- + +### Task C — Qwen: Pathway Landscape + Regulatory Network + +This task calls Qwen via `mcp__bioclaw__call_qwen` AND runs the pathway-search script for real database data. + +Prompt for Task C: +``` +You are mapping the pathway landscape for the bio-research-pipeline. +Research direction: {RESEARCH_DIRECTION} + +Step 1 — Run real pathway database search: +SKILL_DIR=$(find ~/.claude/skills -name "SKILL.md" -path "*/bio-research-pipeline/*" | xargs dirname 2>/dev/null | head -1) +python3 "$SKILL_DIR/scripts/pathway-search" "{RESEARCH_DIRECTION}" + +Step 2 — Call Qwen for pathway synthesis: +Use mcp__bioclaw__call_qwen with: + +system: "You are a systems biology expert specializing in signaling pathway analysis and gene regulatory networks." + +prompt: "For the research topic: {RESEARCH_DIRECTION} + +Please map the complete biological pathway landscape: + +1. CORE PATHWAYS INVOLVED + - List and briefly describe each relevant pathway + - Explain how they interconnect for this topic + +2. REGULATORY HIERARCHY + - Upstream triggers / sensors + - Master regulators (transcription factors, kinases) + - Key effectors and their downstream targets + - Feedback and feedforward loops + +3. CROSSTALK POINTS + - Where do pathways intersect or antagonize each other? + - Which nodes are shared across multiple pathways? + - Potential compensatory mechanisms to be aware of + +4. 
CONTEXT-SPECIFIC REGULATION + - How does this regulation differ between cell types? + - Tissue-specific or disease-specific pathway alterations + - Known species differences (mouse vs human) + +5. THERAPEUTIC INTERVENTION POINTS + - Which nodes are most druggable? + - Existing drugs/inhibitors targeting these pathways + - Potential combination therapy rationale + +Be specific about molecule names and interaction types (phosphorylation, ubiquitination, transcriptional activation, etc.)" + +Output Qwen's response with header: "=== QWEN PATHWAY ANALYSIS ===" +Then append the real database results from Step 1 with header: "=== DATABASE RESULTS ===" +``` + +--- + +**After launching all 3 tasks**, collect results: +``` +taskA_result = TaskOutput(task_id_A) +taskB_result = TaskOutput(task_id_B) +taskC_result = TaskOutput(task_id_C) +``` +Wait for all 3 to complete before proceeding to Stage 2. + +**Send a progress update to the user** via `mcp__bioclaw__send_message`: +``` +"📚 文献检索完成(Claude + MiniMax + Qwen 三路并行) +🔬 Claude: PubMed {N} 篇 + 预印本 {M} 篇 +🤖 MiniMax: 生物医学知识分析完成 +⚡ Qwen: 通路图谱梳理完成 +正在综合分析,生成假说..." +``` + +--- + +## STAGE 2 — PATHWAY SYNTHESIS + HYPOTHESIS GENERATION + +**Goal:** Synthesize the 3 literature sources into a pathway map, then generate ≥5 mechanistic hypotheses. + +### 2a. Build Pathway Map + +From the 3 task outputs, extract: +- All mentioned proteins/genes → list with roles +- All mentioned pathways → list with descriptions +- Key interactions (A activates B, X inhibits Y) +- Unresolved questions explicitly mentioned in papers + +### 2b. Generate ≥5 Hypotheses + +For each hypothesis, output a structured block: + +``` +HYPOTHESIS [N]: [One-sentence title] + +Mechanism: + [2-3 sentences describing the molecular mechanism step by step] + e.g. "We propose that [A] activates [B] under [condition X], which leads to [downstream effect Y] + via [pathway Z]. This is supported by [evidence 1] but has not been directly tested in [context]." 
+ +Key molecular players: + - [Gene/Protein 1]: [role] + - [Gene/Protein 2]: [role] + - [Pathway]: [how it's involved] + +Supporting evidence: + - [Paper/finding that supports this] + - [Observation that is consistent with this] + +Evidence gaps (why this is a hypothesis, not established fact): + - [What has NOT been shown] + - [Conflicting data, if any] + +Novelty score (1-10): [score] +Reason: [why this is or isn't novel] + +Testability score (1-10): [score] +Reason: [how difficult it would be to test with standard wet lab methods] +``` + +Generate hypotheses that: +- Cover **different mechanistic angles** (not just variations of the same idea) +- Range from **conservative** (well-supported, incremental) to **bold** (less evidence, high impact) +- Are **wet-lab testable** (avoid purely computational hypotheses) + +--- + +## STAGE 3 — MULTI-AGENT DEBATE + +**Goal:** Critically evaluate each hypothesis from 3 perspectives to identify the strongest ones. + +For each hypothesis, conduct a structured 3-voice review. 
You will play each role in sequence: + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +DEBATE — HYPOTHESIS [N]: [title] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +🟢 SUPPORTER (argues FOR this hypothesis): + Strongest evidence points: + - [evidence 1] + - [evidence 2] + Why this mechanism is biologically plausible: + - [mechanistic reasoning] + Potential impact if confirmed: + - [scientific/clinical significance] + +🔴 SKEPTIC (argues AGAINST / identifies weaknesses): + Critical weaknesses: + - [flaw 1: e.g., "The key evidence comes from in vitro studies only"] + - [flaw 2: e.g., "Alternative explanation: this effect may be due to [X] instead"] + Confounding factors not accounted for: + - [confounder] + Prior work that challenges this: + - [conflicting evidence or null results] + +🔵 METHODOLOGIST (evaluates experimental feasibility): + To directly test this hypothesis, you would need: + - [key experiment] + Technical challenges: + - [challenge 1] + - [challenge 2] + Timeline estimate: [weeks/months] + Whether a typical university wet lab can do this: [Yes/No/Partially] + Model system recommendation: [cell line / mouse model / organoid / etc.] + +DEBATE VERDICT: + Evidence score (1-10): [score] — How well-supported is it currently? + Novelty score (1-10): [score] — How new is this idea? + Feasibility score (1-10): [score] — Can a wet lab test it in <12 months? + Impact score (1-10): [score] — How significant if confirmed? + + COMPOSITE SCORE: [average, weighted: Evidence×0.3 + Novelty×0.25 + Feasibility×0.25 + Impact×0.2] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +``` + +Run this for ALL hypotheses. Then rank by composite score and select **TOP 3**. 
+ +--- + +## STAGE 4 — TOP 3 REFINEMENT + +For each of the top 3 hypotheses, expand the mechanism with full molecular detail: + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +TOP [1/2/3]: [Hypothesis title] +Final composite score: [X.X/10] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +REFINED MECHANISM: + [4-6 sentences with full molecular detail] + Include: upstream triggers, key effectors, downstream consequences, feedback regulation + +PATHWAY DIAGRAM (text-based): + [Stimulus/Condition] + ↓ + [Receptor/Sensor] → activates → [Kinase/TF] + ↓ + [Key effector] + ↓ (promotes) ↓ (inhibits) + [Outcome A] [Outcome B] + +KEY UNKNOWNS to be resolved by experiments: + 1. [Unknown 1] + 2. [Unknown 2] + 3. [Unknown 3] +``` + +--- + +## STAGE 5 — WET LAB EXPERIMENTAL DESIGN + +For each of the top 3 hypotheses, design a complete wet-lab experimental plan: + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +EXPERIMENTAL PLAN — [Hypothesis title] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +RECOMMENDED MODEL SYSTEM: + Primary: [e.g., HEK293T cells / primary mouse hepatocytes / C57BL/6 mice] + Rationale: [why this model is appropriate] + Alternative: [backup model if primary unavailable] + +EXPERIMENT 1 — [Core test of the central claim] + Objective: [what this experiment proves or disproves] + + Method: + 1. [Step 1] + 2. [Step 2] + 3. [Step 3] + + Key reagents: + - [Antibody/siRNA/inhibitor/construct needed] + - [Source: commercial/need to generate] + + Readout: [what you measure — Western blot / qPCR / immunofluorescence / etc.] 
+ + Expected result if hypothesis is TRUE: + - [specific measurable outcome, e.g., "50%+ increase in phospho-X levels"] + + Expected result if hypothesis is FALSE: + - [what you'd see instead] + + Controls: + - Positive control: [what and why] + - Negative control: [what and why] + - Technical control: [e.g., loading control, vehicle control] + + Estimated time: [X weeks] + Difficulty: [Easy / Medium / Hard] + +EXPERIMENT 2 — [Validation / Orthogonal approach] + [same structure as Experiment 1] + +EXPERIMENT 3 — [In vivo / disease-relevance test, if applicable] + [same structure — note if this requires animal work / ethics approval] + +DECISION TREE: + If Experiment 1 result is positive → proceed to Experiment 2 + If Experiment 1 result is negative → [interpret: reject hypothesis OR check [alternative explanation]] + If Experiment 2 confirms → [next step: submit for funding / expand to in vivo] + If Experiment 2 conflicts with Experiment 1 → [troubleshoot: check [specific variable]] + +TIMELINE OVERVIEW: + Week 1-2: [setup, reagent procurement] + Week 3-6: [Experiment 1] + Week 7-10: [Experiment 2] + Week 11-16: [Experiment 3, if applicable] + Total estimated time to proof-of-concept: [X months] + +KEY RISKS: + - [Risk 1: e.g., "Primary antibody may not work in mouse samples"] + Mitigation: [e.g., "Order 2 alternative antibodies from different vendors"] + - [Risk 2: e.g., "Model system may not recapitulate in vivo physiology"] + Mitigation: [e.g., "Validate key finding in primary cells"] +``` + +--- + +## FINAL OUTPUT FORMAT + +After completing all 5 stages, produce a summary: + +``` +╔══════════════════════════════════════════════════════╗ +║ RESEARCH BRIEF — [RESEARCH DIRECTION] ║ +║ Generated: [date] ║ +╚══════════════════════════════════════════════════════╝ + +LITERATURE COVERAGE: + PubMed papers reviewed: ~[N] + Preprints reviewed: ~[N] + Key pathways identified: [list] + +ALL HYPOTHESES RANKED: + #1 [score] — [title] + #2 [score] — [title] + #3 [score] — 
[title] ← TOP 3 + #4 [score] — [title] + #5 [score] — [title] + [#6+ if generated] + +TOP 3 RECOMMENDED FOR INVESTIGATION: + → [Hypothesis 1 title] (strongest evidence + feasible) + → [Hypothesis 2 title] (most novel) + → [Hypothesis 3 title] (highest clinical impact) + +NEXT STEPS: + Immediate (0-1 month): [first experiment to run] + Short-term (1-6 months): [validation plan] + Long-term (6-18 months): [expansion strategy] + +FULL EXPERIMENTAL PLANS: see sections above +``` + +Save the complete output to `/workspace/group/research-brief-[slug].md` where [slug] is a short version of the research direction. + +Tell the user: "研究简报已完成,保存在 research-brief-[slug].md。以下是摘要:" then show the summary block. + +--- + +## IMPORTANT NOTES + +- **Do not skip the debate stage** — the debate is essential to filter weak hypotheses +- **Wet lab focus** — all experimental designs must be physically executable (pipettes, cells, animals), not just computational +- **Be specific** — vague statements like "further research needed" are not acceptable; every gap should map to a specific experiment +- **Cite as you go** — whenever you make a claim, reference which paper or database it came from +- **Chinese output is fine** — if the user wrote in Chinese, respond in Chinese throughout diff --git a/container/skills.dev-backup/bio-research-pipeline/scripts/pathway-search b/container/skills.dev-backup/bio-research-pipeline/scripts/pathway-search new file mode 100755 index 0000000..fa7abee --- /dev/null +++ b/container/skills.dev-backup/bio-research-pipeline/scripts/pathway-search @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +""" +Pathway database search (KEGG + Reactome + STRING) for bio-research-pipeline. 
def _parse_kegg_gene_symbols(flat_text: str) -> list[str]:
    """Extract gene symbols from the GENE section of a KEGG flat-file record."""
    symbols: list[str] = []
    in_gene_section = False
    for line in flat_text.split("\n"):
        if not line.strip():
            continue
        if not line[0].isspace():
            # Text in column 0 is a section keyword (GENE, COMPOUND,
            # REFERENCE, "///", ...). Any keyword other than GENE closes
            # the gene section.
            keyword = line.split(None, 1)[0]
            in_gene_section = keyword == "GENE"
            if not in_gene_section:
                continue
            # Drop the keyword so the first entry has the same
            # "<id>  <SYMBOL>; description" layout as continuation lines.
            line = line[len(keyword):]
        elif not in_gene_section:
            continue
        fields = line.split(";")[0].split()
        if len(fields) >= 2:
            symbols.append(fields[1])
    return symbols


def get_kegg_pathway_genes(pathway_id: str) -> list[str]:
    """Get genes in a KEGG pathway.

    Fetches the pathway's flat-file record from the KEGG REST API and returns
    the gene symbols listed in its GENE section.

    Args:
        pathway_id: KEGG pathway identifier, with or without the ``path:``
            prefix (the prefix is added when missing).

    Returns:
        Up to 30 gene symbols in record order. An empty list is returned when
        the request fails, the status is not 200, or the record has no GENE
        section.

    NOTE(review): reference pathway ids (``path:mapNNNNN``) presumably carry
    no GENE section, so organism-specific ids (e.g. ``hsa04115``) are likely
    needed for a non-empty result -- confirm against the KEGG API.
    """
    if not pathway_id.startswith("path:"):
        pathway_id = f"path:{pathway_id}"

    url = f"https://rest.kegg.jp/get/{pathway_id}"
    try:
        r = requests.get(url, timeout=15)
        if r.status_code != 200:
            return []
        return _parse_kegg_gene_symbols(r.text)[:30]
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller carry on.
        print(f"[WARNING] KEGG gene fetch failed: {e}", file=sys.stderr)
        return []
for pathways.""" + url = "https://reactome.org/ContentService/search/query" + params = { + "query": query, + "species": "Homo sapiens", + "types": "Pathway", + "cluster": "true", + } + results = [] + + try: + r = requests.get(url, params=params, timeout=20) + if r.status_code == 200: + data = r.json() + entries = data.get("results", [{}])[0].get("entries", []) if data.get("results") else [] + for entry in entries[:8]: + results.append({ + "id": entry.get("stId", ""), + "name": entry.get("name", ""), + "type": entry.get("type", ""), + "species": entry.get("species", ""), + "source": "Reactome", + "url": f"https://reactome.org/PathwayBrowser/#/{entry.get('stId', '')}", + }) + except Exception as e: + print(f"[WARNING] Reactome search failed: {e}", file=sys.stderr) + + return results + + +def search_string_interactions(gene_symbol: str, species: int = 9606, limit: int = 20) -> list[dict]: + """Get protein-protein interactions from STRING.""" + url = "https://string-db.org/api/json/network" + params = { + "identifiers": gene_symbol, + "species": species, + "limit": limit, + "required_score": 700, # high confidence + "caller_identity": "bioclaw-research-pipeline", + } + interactions = [] + + try: + r = requests.get(url, params=params, timeout=20) + if r.status_code == 200: + data = r.json() + for edge in data[:limit]: + interactions.append({ + "protein_a": edge.get("preferredName_A", ""), + "protein_b": edge.get("preferredName_B", ""), + "score": edge.get("score", 0), + "source": "STRING", + }) + except Exception as e: + print(f"[WARNING] STRING search failed: {e}", file=sys.stderr) + + return interactions + + +def search_string_functional(gene_symbol: str, species: int = 9606) -> list[dict]: + """Get functional enrichment for a gene from STRING.""" + url = "https://string-db.org/api/json/functional_annotation" + params = { + "identifiers": gene_symbol, + "species": species, + "caller_identity": "bioclaw-research-pipeline", + } + annotations = [] + + try: + r = 
requests.get(url, params=params, timeout=20) + if r.status_code == 200: + data = r.json() + for item in data[:15]: + category = item.get("category", "") + if category in ("KEGG", "Reactome", "Process"): + annotations.append({ + "category": category, + "term": item.get("term", ""), + "description": item.get("description", ""), + "fdr": item.get("fdr", 1.0), + }) + except Exception as e: + print(f"[WARNING] STRING functional annotation failed: {e}", file=sys.stderr) + + return annotations + + +def main(): + parser = argparse.ArgumentParser(description="Search pathway databases (KEGG, Reactome, STRING)") + parser.add_argument("query", help="Gene name, protein name, or pathway keyword") + parser.add_argument("--gene", help="Specific gene symbol for STRING interaction search") + parser.add_argument("--species", default="hsa", help="Species code (hsa=human, mmu=mouse)") + parser.add_argument("--json", action="store_true", help="Output raw JSON") + args = parser.parse_args() + + species_ncbi = 9606 if args.species in ("hsa", "human") else 10090 # mouse fallback + + print(f"[Pathway] Searching for: {args.query}", file=sys.stderr) + + # Run searches + kegg_pathways = search_kegg_pathways(args.query, species=args.species) + time.sleep(0.3) + + reactome_pathways = search_reactome(args.query) + time.sleep(0.3) + + gene_symbol = args.gene or args.query.split()[0] + string_interactions = search_string_interactions(gene_symbol, species=species_ncbi) + time.sleep(0.3) + + string_functions = search_string_functional(gene_symbol, species=species_ncbi) if args.gene else [] + + # Get genes for top KEGG pathway + kegg_genes = [] + if kegg_pathways: + top_pathway_id = kegg_pathways[0]["id"] + kegg_genes = get_kegg_pathway_genes(top_pathway_id) + + result = { + "query": args.query, + "kegg_pathways": kegg_pathways, + "kegg_genes_top_pathway": kegg_genes, + "reactome_pathways": reactome_pathways, + "string_interactions": string_interactions, + "string_functional": string_functions, + } + 
+ if args.json: + print(json.dumps(result, ensure_ascii=False, indent=2)) + return + + # Formatted output + print(f"\n{'='*60}") + print(f"PATHWAY DATABASE RESULTS: {args.query}") + print(f"{'='*60}\n") + + # KEGG + if kegg_pathways: + print(f"KEGG PATHWAYS ({len(kegg_pathways)} found):") + for p in kegg_pathways: + print(f" {p['id']:20s} {p['name']}") + if kegg_genes: + print(f"\n Genes in top pathway ({kegg_pathways[0]['name']}):") + print(f" {', '.join(kegg_genes)}") + else: + print("KEGG: No pathways found for this query.") + + print() + + # Reactome + if reactome_pathways: + print(f"REACTOME PATHWAYS ({len(reactome_pathways)} found):") + for p in reactome_pathways: + print(f" [{p['id']}] {p['name']}") + print(f" URL: {p['url']}") + else: + print("Reactome: No pathways found.") + + print() + + # STRING + if string_interactions: + print(f"STRING INTERACTIONS for '{gene_symbol}' (high-confidence, score≥0.7):") + for edge in string_interactions[:15]: + score_pct = int(edge["score"] * 100) + print(f" {edge['protein_a']:12s} ↔ {edge['protein_b']:12s} (confidence: {score_pct}%)") + else: + print(f"STRING: No high-confidence interactions found for '{gene_symbol}'.") + + if string_functions: + print(f"\n Functional annotations:") + for fn in string_functions[:8]: + print(f" [{fn['category']}] {fn['description']} (FDR={fn['fdr']:.2e})") + + print(f"\n{'='*60}") + print("END OF PATHWAY RESULTS") + print(f"{'='*60}") + + print("\nNOTE FOR AGENT: Use these pathways as context when formulating hypotheses.") + print("Cross-reference pathway membership with literature findings to identify key nodes.") + + +if __name__ == "__main__": + main() diff --git a/container/skills.dev-backup/bio-research-pipeline/scripts/preprint-fetch b/container/skills.dev-backup/bio-research-pipeline/scripts/preprint-fetch new file mode 100755 index 0000000..7cb8887 --- /dev/null +++ b/container/skills.dev-backup/bio-research-pipeline/scripts/preprint-fetch @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 
BIORXIV_API = "https://api.biorxiv.org/details/biorxiv"
MEDRXIV_API = "https://api.biorxiv.org/details/medrxiv"


def fetch_server(api_url: str, start_date: str, end_date: str, cursor: int = 0) -> list[dict]:
    """Fetch one page of preprint records for a date interval.

    Args:
        api_url: Base "details" endpoint (see BIORXIV_API / MEDRXIV_API).
        start_date: Interval start, inserted into the URL as given.
        end_date: Interval end, inserted into the URL as given.
        cursor: Offset of the first record of the requested page.

    Returns:
        The ``collection`` list of the JSON response, or an empty list when
        the request or decoding fails (the error is reported on stderr).
    """
    url = f"{api_url}/{start_date}/{end_date}/{cursor}/json"
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()
        return data.get("collection", [])
    except Exception as e:
        print(f"[WARNING] API error: {e}", file=sys.stderr)
        return []


def score_relevance(paper: dict, keywords: list[str]) -> float:
    """Score a paper by case-insensitive keyword hits.

    Each keyword adds 3.0 if it occurs in the title, 1.0 if it occurs in the
    abstract and 0.5 if it occurs in the category. Missing or ``None`` fields
    count as empty text.
    """
    # `or ""` also covers fields that are present but null in the API payload.
    title = (paper.get("title") or "").lower()
    abstract = (paper.get("abstract") or "").lower()
    category = (paper.get("category") or "").lower()

    score = 0.0
    for kw in keywords:
        kw_lower = kw.lower()
        if kw_lower in title:
            score += 3.0
        if kw_lower in abstract:
            score += 1.0
        if kw_lower in category:
            score += 0.5

    return score


def fetch_preprints(
    topic: str,
    days_back: int = 180,
    max_results: int = 30,
    server: str = "both",
    max_pages: int = 2,
) -> list[dict]:
    """Collect preprints from the selected servers and rank them by keyword relevance.

    Args:
        topic: Free-text topic; its whitespace-separated words become keywords.
        days_back: Size of the date window ending today.
        max_results: Maximum number of papers returned.
        server: "biorxiv", "medrxiv" or "both".
        max_pages: Pages requested per server, at cursor offsets 0, 100, ...
            Paging for a server stops early at the first empty page.

    Returns:
        Papers with a relevance score above 0, sorted by score and then date
        (both descending), truncated to ``max_results``.
    """
    now = datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - timedelta(days=days_back)).strftime("%Y-%m-%d")

    # Prefer words longer than 3 characters to skip filler words, but fall
    # back to every word when that would leave nothing (e.g. "p53 DNA"),
    # otherwise no paper could ever score above 0.
    words = topic.split()
    keywords = [w for w in words if len(w) > 3] or words

    print(f"[Preprint] Date range: {start_date} to {end_date}", file=sys.stderr)
    print(f"[Preprint] Keywords: {keywords}", file=sys.stderr)

    all_papers = []

    servers_to_search = []
    if server in ("biorxiv", "both"):
        servers_to_search.append(("bioRxiv", BIORXIV_API))
    if server in ("medrxiv", "both"):
        servers_to_search.append(("medRxiv", MEDRXIV_API))

    for server_name, api_url in servers_to_search:
        print(f"[Preprint] Fetching from {server_name}...", file=sys.stderr)

        # Cursor offsets advance in steps of 100 (one page each)
        for cursor in range(0, 100 * max_pages, 100):
            papers = fetch_server(api_url, start_date, end_date, cursor)
            if not papers:
                break

            # Score and filter
            for paper in papers:
                score = score_relevance(paper, keywords)
                if score > 0:
                    all_papers.append({
                        "server": server_name,
                        "title": paper.get("title", ""),
                        "authors": paper.get("authors", ""),
                        "date": paper.get("date", ""),
                        "doi": paper.get("doi", ""),
                        "abstract": paper.get("abstract", ""),
                        "category": paper.get("category", ""),
                        "version": paper.get("version", "1"),
                        "relevance_score": score,
                    })

            time.sleep(0.3)  # be polite to API

    # Sort by relevance, then by date (most recent first)
    all_papers.sort(key=lambda x: (x["relevance_score"], x["date"]), reverse=True)

    return all_papers[:max_results]


def main():
    """CLI entry point: fetch preprints for a topic and print them as text or JSON.

    Progress goes to stderr; the report (or raw JSON with ``--json``) goes to
    stdout.
    """
    parser = argparse.ArgumentParser(description="Fetch and summarize bioRxiv/medRxiv preprints")
    parser.add_argument("topic", help="Research topic")
    parser.add_argument("--max", type=int, default=30, help="Max preprints to return (default: 30)")
    parser.add_argument("--days", type=int, default=180, help="Days back to search (default: 180)")
    parser.add_argument("--server", choices=["biorxiv", "medrxiv", "both"], default="both")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    print(f"[Preprint] Searching for: {args.topic}", file=sys.stderr)

    papers = fetch_preprints(
        topic=args.topic,
        days_back=args.days,
        max_results=args.max,
        server=args.server,
    )

    print(f"[Preprint] Found {len(papers)} relevant preprints", file=sys.stderr)

    if args.json:
        print(json.dumps(papers, ensure_ascii=False, indent=2))
        return

    print(f"\n{'='*60}")
    print(f"PREPRINT RESULTS: {args.topic}")
    print(f"Papers found: {len(papers)} | Servers: {args.server} | Period: last {args.days} days")
    print(f"{'='*60}\n")

    if not papers:
        print("No relevant preprints found for this topic in the specified date range.")
        print("Suggestions:")
        print(" - Try broader search terms")
        print(" - Increase --days parameter")
        print(f" - Search directly at https://www.biorxiv.org/search/{quote(args.topic)}")
        return

    for i, paper in enumerate(papers, 1):
        # Truncate authors
        authors = paper["authors"]
        if len(authors) > 80:
            authors = authors[:80] + "..."

        print(f"[{i}] [{paper['server']}] {paper['title']}")
        print(f" {authors}")
        print(f" Date: {paper['date']} | Category: {paper['category']} | v{paper['version']}")
        print(f" DOI: https://doi.org/{paper['doi']}" if paper['doi'] else " DOI: N/A")

        abstract = paper["abstract"]
        if len(abstract) > 450:
            abstract = abstract[:450] + "..."
        if abstract:
            print(f" {abstract}")

        print(f" Relevance score: {paper['relevance_score']:.1f}")
        print()

    print(f"\n{'='*60}")
    print("END OF PREPRINT RESULTS")
    print(f"{'='*60}")

    # Highlight novel findings
    print("\nNOTE FOR AGENT: Preprints have NOT been peer-reviewed.")
    print("Treat findings as preliminary. Check if any have since been published in journals.")
    print(f"Direct search URL: https://www.biorxiv.org/search/{quote(args.topic)}")


if __name__ == "__main__":
    main()
def search_pubmed(query: str, max_results: int = 40, years_back: int = 5) -> list[str]:
    """Run a relevance-sorted PubMed search limited to recent publication years.

    Args:
        query: Search term passed to ``Entrez.esearch`` as ``term``.
        max_results: Passed as ``retmax``.
        years_back: The publication-date window spans from
            ``current year - years_back`` to the current year.

    Returns:
        The ``IdList`` of the parsed search result.
    """
    current_year = datetime.now().year
    min_year = current_year - years_back

    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=max_results,
        sort="relevance",
        datetype="pdat",
        mindate=str(min_year),
        maxdate=str(current_year),
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    return record["IdList"]


def fetch_abstracts(pmids: list[str]) -> list[dict]:
    """Fetch MEDLINE records for the given PMIDs and reduce them to plain dicts.

    Returns:
        One dict per record with the keys ``pmid``, ``title``, ``authors``
        (first 3 only), ``author_count`` (length of the untruncated author
        list), ``journal``, ``year``, ``abstract``, ``mesh_terms`` and
        ``keywords`` (first 10 each). An empty input yields an empty list
        without contacting the service.
    """
    if not pmids:
        return []

    handle = Entrez.efetch(
        db="pubmed",
        id=",".join(pmids),
        rettype="medline",
        retmode="text",
    )
    try:
        records = list(Medline.parse(handle))
    finally:
        # Release the connection even when parsing fails part-way.
        handle.close()

    papers = []
    for rec in records:
        all_authors = rec.get("AU", [])
        papers.append({
            "pmid": rec.get("PMID", ""),
            "title": rec.get("TI", "No title"),
            "authors": all_authors[:3],  # first 3 authors
            "author_count": len(all_authors),
            "journal": rec.get("TA", ""),
            "year": rec.get("DP", "")[:4],
            "abstract": rec.get("AB", "No abstract available"),
            "mesh_terms": rec.get("MH", [])[:10],
            "keywords": rec.get("OT", [])[:10],
        })
    return papers


def extract_key_entities(papers: list[dict]) -> dict:
    """Simple heuristic extraction of pathway keywords mentioned in abstracts.

    A keyword counts as mentioned when it appears, case-insensitively, as its
    own token: it must not be preceded by a letter or digit and must not run
    on into further letters, apart from a plural "s". Trailing digits are
    allowed, so "CDK" matches "CDK4" but "RB" does not match "carbon".

    Returns:
        ``{"mentioned_pathways": [...]}`` with at most 20 keywords in the
        order of the keyword list below. Spellings that differ only by case
        (e.g. "Wnt"/"WNT") are reported once, using the first spelling.
    """
    all_text = " ".join(p["abstract"] for p in papers).upper()

    # Common pathway keywords
    pathway_keywords = [
        "MAPK", "PI3K", "AKT", "mTOR", "NF-κB", "NFKB", "Wnt", "WNT",
        "Notch", "NOTCH", "Hedgehog", "JAK", "STAT", "TGF-β", "TGFB",
        "p53", "TP53", "AMPK", "HIF", "VEGF", "TNF", "IL-6", "IL6",
        "Hippo", "YAP", "TAZ", "KRAS", "EGFR", "ERK", "JNK", "p38",
        "CDK", "RB", "E2F", "Autophagy", "AUTOPHAGY", "Apoptosis", "APOPTOSIS",
        "Ferroptosis", "FERROPTOSIS", "Pyroptosis", "Ubiquitin", "UBIQUITIN",
    ]

    pathways = []
    seen = set()
    for kw in pathway_keywords:
        token = kw.upper()
        if token in seen:
            # Same keyword in a different letter case was already handled.
            continue
        seen.add(token)
        # The text is upper-cased, so a plural "s" shows up as "S".
        pattern = r"(?<![A-Z0-9])" + re.escape(token) + r"S?(?![A-Z])"
        if re.search(pattern, all_text):
            pathways.append(kw)

    return {"mentioned_pathways": pathways[:20]}


def main():
    """CLI entry point: search PubMed for a topic and print the papers as text or JSON.

    A mechanism-focused query is tried first; if it finds nothing, the plain
    topic is searched instead. Progress goes to stderr; the report (or raw
    JSON with ``--json``) goes to stdout.
    """
    parser = argparse.ArgumentParser(description="Fetch and summarize PubMed literature")
    parser.add_argument("topic", help="Research topic or query string")
    parser.add_argument("--max", type=int, default=40, help="Max papers to retrieve (default: 40)")
    parser.add_argument("--years", type=int, default=5, help="Years back to search (default: 5)")
    parser.add_argument("--json", action="store_true", help="Output raw JSON instead of formatted text")
    args = parser.parse_args()

    print(f"[PubMed] Searching for: {args.topic}", file=sys.stderr)
    print(f"[PubMed] Parameters: max={args.max}, years_back={args.years}", file=sys.stderr)

    # Build enriched query
    base_query = args.topic
    mechanism_query = f"({base_query}[Title/Abstract]) AND (mechanism[Title/Abstract] OR pathway[Title/Abstract] OR signaling[Title/Abstract] OR molecular[Title/Abstract])"

    pmids = search_pubmed(mechanism_query, max_results=args.max, years_back=args.years)
    print(f"[PubMed] Found {len(pmids)} papers", file=sys.stderr)

    if not pmids:
        # Fallback: broader search without mechanism filter
        pmids = search_pubmed(base_query, max_results=args.max, years_back=args.years)
        print(f"[PubMed] Fallback search: {len(pmids)} papers", file=sys.stderr)

    # The search is already limited to --max ids, so fetch all of them.
    papers = fetch_abstracts(pmids)
    entities = extract_key_entities(papers)

    if args.json:
        print(json.dumps({"papers": papers, "entities": entities}, ensure_ascii=False, indent=2))
        return

    # Formatted output
    print(f"\n{'='*60}")
    print(f"PUBMED SEARCH RESULTS: {args.topic}")
    print(f"Papers retrieved: {len(papers)} | Search period: last {args.years} years")
    print(f"{'='*60}\n")

    if entities["mentioned_pathways"]:
        print(f"KEY PATHWAYS MENTIONED ACROSS PAPERS:")
        print(f" {', '.join(entities['mentioned_pathways'])}\n")

    for i, paper in enumerate(papers, 1):
        authors_str = ", ".join(paper["authors"]) if paper["authors"] else "Unknown"
        # Only flag omitted authors when the full list was longer than the 3 shown.
        if paper.get("author_count", len(paper["authors"])) > 3:
            authors_str += " et al."

        print(f"[{i}] {paper['title']}")
        print(f" {authors_str} | {paper['journal']} {paper['year']} | PMID: {paper['pmid']}")

        # Truncate abstract to 400 chars
        abstract = paper["abstract"]
        if len(abstract) > 400:
            abstract = abstract[:400] + "..."
        print(f" {abstract}")

        if paper["mesh_terms"]:
            print(f" MeSH: {', '.join(paper['mesh_terms'][:5])}")
        print()

    print(f"\n{'='*60}")
    print("END OF PUBMED RESULTS")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()
+ +## Quick Reference + +### Sequence Search +```bash +# Nucleotide BLAST +blastn -query input.fa -subject ref.fa -outfmt 6 -evalue 1e-5 + +# Protein BLAST +blastp -query protein.fa -subject ref_protein.fa -outfmt 6 + +# Translate then search +blastx -query nucleotide.fa -subject protein_db.fa -outfmt 6 +``` + +### Read Alignment +```bash +# Index reference +bwa index reference.fa + +# Align short reads +bwa mem reference.fa reads_R1.fq reads_R2.fq > aligned.sam + +# Long reads +minimap2 -a reference.fa long_reads.fq > aligned.sam + +# SAM to sorted BAM +samtools view -bS aligned.sam | samtools sort -o sorted.bam +samtools index sorted.bam +``` + +### Quality Control +```bash +# FastQC report +fastqc reads.fq -o qc_output/ + +# FASTA/FASTQ stats +seqtk comp reads.fq | head +seqtk size reads.fq +``` + +### Genome Arithmetic +```bash +# Intersect two BED files +bedtools intersect -a regions.bed -b features.bed + +# Coverage +bedtools coverage -a regions.bed -b aligned.bam + +# Get FASTA from BED regions +bedtools getfasta -fi reference.fa -bed regions.bed +``` + +### Python Quick Recipes + +```python +# Read FASTA/FASTQ +from Bio import SeqIO +for record in SeqIO.parse("input.fa", "fasta"): + print(record.id, len(record.seq)) + +# Fetch from NCBI +from Bio import Entrez +Entrez.email = "bioclaw@example.com" +handle = Entrez.efetch(db="nucleotide", id="NM_000546", rettype="fasta") +record = SeqIO.read(handle, "fasta") + +# Differential expression +from pydeseq2 import DeseqDataSet, DeseqStats +dds = DeseqDataSet(counts=count_matrix, metadata=metadata, design="~condition") +dds.deseq2() +stat_res = DeseqStats(dds, contrast=["condition", "treated", "untreated"]) +stat_res.summary() + +# Single-cell RNA-seq +import scanpy as sc +adata = sc.read_h5ad("data.h5ad") +sc.pp.normalize_total(adata) +sc.pp.log1p(adata) +sc.tl.pca(adata) +sc.tl.umap(adata) +sc.tl.leiden(adata) + +# Molecular structures +from rdkit import Chem +from rdkit.Chem import Descriptors +mol = 
Chem.MolFromSmiles("CC(=O)OC1=CC=CC=C1C(=O)O") # Aspirin +print(f"MW: {Descriptors.MolWt(mol):.1f}") +print(f"LogP: {Descriptors.MolLogP(mol):.2f}") +``` + +## Important Notes + +- For remote BLAST against NCBI, use `Bio.Blast.NCBIWWW.qblast()` — this sends the query over the network +- For large files, prefer streaming with `SeqIO.parse()` over `SeqIO.read()` +- Save plots to files (`plt.savefig("/workspace/group/plot.png")`) since there's no display +- Write output files to `/workspace/group/` so the user can access them diff --git a/container/skills/alphafold-database/scripts/analyze b/container/skills/alphafold-database/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/alphafold-database/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/alphafold-database/scripts/process b/container/skills/alphafold-database/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/alphafold-database/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/alphafold-database/scripts/run b/container/skills/alphafold-database/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/alphafold-database/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/alphafold-database/scripts/search b/container/skills/alphafold-database/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/alphafold-database/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/alphafold-database/skill.md b/container/skills/alphafold-database/skill.md new file mode 100644 index 0000000..bca3c86 --- /dev/null +++ b/container/skills/alphafold-database/skill.md @@ -0,0 +1,500 @@ +--- +name: alphafold-database +description: "Access AlphaFold's 200M+ 
AI-predicted protein structures. Retrieve structures by UniProt ID, download PDB/mmCIF files, analyze confidence metrics (pLDDT, PAE), for drug discovery and structural biology." +--- + +# AlphaFold Database + +## Overview + +AlphaFold DB is a public repository of AI-predicted 3D protein structures for over 200 million proteins, maintained by DeepMind and EMBL-EBI. Access structure predictions with confidence metrics, download coordinate files, retrieve bulk datasets, and integrate predictions into computational workflows. + +## When to Use This Skill + +This skill should be used when working with AI-predicted protein structures in scenarios such as: + +- Retrieving protein structure predictions by UniProt ID or protein name +- Downloading PDB/mmCIF coordinate files for structural analysis +- Analyzing prediction confidence metrics (pLDDT, PAE) to assess reliability +- Accessing bulk proteome datasets via Google Cloud Platform +- Comparing predicted structures with experimental data +- Performing structure-based drug discovery or protein engineering +- Building structural models for proteins lacking experimental structures +- Integrating AlphaFold predictions into computational pipelines + +## Core Capabilities + +### 1. 
Searching and Retrieving Predictions + +**Using Biopython (Recommended):** + +The Biopython library provides the simplest interface for retrieving AlphaFold structures: + +```python +from Bio.PDB import alphafold_db + +# Get all predictions for a UniProt accession +predictions = list(alphafold_db.get_predictions("P00520")) + +# Download structure file (mmCIF format) +for prediction in predictions: + cif_file = alphafold_db.download_cif_for(prediction, directory="./structures") + print(f"Downloaded: {cif_file}") + +# Get Structure objects directly +from Bio.PDB import MMCIFParser +structures = list(alphafold_db.get_structural_models_for("P00520")) +``` + +**Direct API Access:** + +Query predictions using REST endpoints: + +```python +import requests + +# Get prediction metadata for a UniProt accession +uniprot_id = "P00520" +api_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}" +response = requests.get(api_url) +prediction_data = response.json() + +# Extract AlphaFold ID +alphafold_id = prediction_data[0]['entryId'] +print(f"AlphaFold ID: {alphafold_id}") +``` + +**Using UniProt to Find Accessions:** + +Search UniProt to find protein accessions first: + +```python +import urllib.parse, urllib.request + +def get_uniprot_ids(query, query_type='PDB_ID'): + """Query UniProt to get accession IDs""" + url = 'https://www.uniprot.org/uploadlists/' + params = { + 'from': query_type, + 'to': 'ACC', + 'format': 'txt', + 'query': query + } + data = urllib.parse.urlencode(params).encode('ascii') + with urllib.request.urlopen(urllib.request.Request(url, data)) as response: + return response.read().decode('utf-8').splitlines() + +# Example: Find UniProt IDs for a protein name +protein_ids = get_uniprot_ids("hemoglobin", query_type="GENE_NAME") +``` + +### 2. 
Downloading Structure Files + +AlphaFold provides multiple file formats for each prediction: + +**File Types Available:** + +- **Model coordinates** (`model_v4.cif`): Atomic coordinates in mmCIF/PDBx format +- **Confidence scores** (`confidence_v4.json`): Per-residue pLDDT scores (0-100) +- **Predicted Aligned Error** (`predicted_aligned_error_v4.json`): PAE matrix for residue pair confidence + +**Download URLs:** + +```python +import requests + +alphafold_id = "AF-P00520-F1" +version = "v4" + +# Model coordinates (mmCIF) +model_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-model_{version}.cif" +response = requests.get(model_url) +with open(f"{alphafold_id}.cif", "w") as f: + f.write(response.text) + +# Confidence scores (JSON) +confidence_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-confidence_{version}.json" +response = requests.get(confidence_url) +confidence_data = response.json() + +# Predicted Aligned Error (JSON) +pae_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-predicted_aligned_error_{version}.json" +response = requests.get(pae_url) +pae_data = response.json() +``` + +**PDB Format (Alternative):** + +```python +# Download as PDB format instead of mmCIF +pdb_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-model_{version}.pdb" +response = requests.get(pdb_url) +with open(f"{alphafold_id}.pdb", "wb") as f: + f.write(response.content) +``` + +### 3. 
Working with Confidence Metrics + +AlphaFold predictions include confidence estimates critical for interpretation: + +**pLDDT (per-residue confidence):** + +```python +import json +import requests + +# Load confidence scores +alphafold_id = "AF-P00520-F1" +confidence_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-confidence_v4.json" +confidence = requests.get(confidence_url).json() + +# Extract pLDDT scores +plddt_scores = confidence['confidenceScore'] + +# Interpret confidence levels +# pLDDT > 90: Very high confidence +# pLDDT 70-90: High confidence +# pLDDT 50-70: Low confidence +# pLDDT < 50: Very low confidence + +high_confidence_residues = [i for i, score in enumerate(plddt_scores) if score > 90] +print(f"High confidence residues: {len(high_confidence_residues)}/{len(plddt_scores)}") +``` + +**PAE (Predicted Aligned Error):** + +PAE indicates confidence in relative domain positions: + +```python +import numpy as np +import matplotlib.pyplot as plt + +# Load PAE matrix +pae_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-predicted_aligned_error_v4.json" +pae = requests.get(pae_url).json() + +# Visualize PAE matrix +pae_matrix = np.array(pae['distance']) +plt.figure(figsize=(10, 8)) +plt.imshow(pae_matrix, cmap='viridis_r', vmin=0, vmax=30) +plt.colorbar(label='PAE (Å)') +plt.title(f'Predicted Aligned Error: {alphafold_id}') +plt.xlabel('Residue') +plt.ylabel('Residue') +plt.savefig(f'{alphafold_id}_pae.png', dpi=300, bbox_inches='tight') + +# Low PAE values (<5 Å) indicate confident relative positioning +# High PAE values (>15 Å) suggest uncertain domain arrangements +``` + +### 4. 
Bulk Data Access via Google Cloud + +For large-scale analyses, use Google Cloud datasets: + +**Google Cloud Storage:** + +```bash +# Install gsutil +uv pip install gsutil + +# List available data +gsutil ls gs://public-datasets-deepmind-alphafold-v4/ + +# Download entire proteomes (by taxonomy ID) +gsutil -m cp gs://public-datasets-deepmind-alphafold-v4/proteomes/proteome-tax_id-9606-*.tar . + +# Download specific files +gsutil cp gs://public-datasets-deepmind-alphafold-v4/accession_ids.csv . +``` + +**BigQuery Metadata Access:** + +```python +from google.cloud import bigquery + +# Initialize client +client = bigquery.Client() + +# Query metadata +query = """ +SELECT + entryId, + uniprotAccession, + organismScientificName, + globalMetricValue, + fractionPlddtVeryHigh +FROM `bigquery-public-data.deepmind_alphafold.metadata` +WHERE organismScientificName = 'Homo sapiens' + AND fractionPlddtVeryHigh > 0.8 +LIMIT 100 +""" + +results = client.query(query).to_dataframe() +print(f"Found {len(results)} high-confidence human proteins") +``` + +**Download by Species:** + +```python +import subprocess + +def download_proteome(taxonomy_id, output_dir="./proteomes"): + """Download all AlphaFold predictions for a species""" + pattern = f"gs://public-datasets-deepmind-alphafold-v4/proteomes/proteome-tax_id-{taxonomy_id}-*_v4.tar" + cmd = f"gsutil -m cp {pattern} {output_dir}/" + subprocess.run(cmd, shell=True, check=True) + +# Download E. coli proteome (tax ID: 83333) +download_proteome(83333) + +# Download human proteome (tax ID: 9606) +download_proteome(9606) +``` + +### 5. 
Parsing and Analyzing Structures + +Work with downloaded AlphaFold structures using BioPython: + +```python +from Bio.PDB import MMCIFParser, PDBIO +import numpy as np + +# Parse mmCIF file +parser = MMCIFParser(QUIET=True) +structure = parser.get_structure("protein", "AF-P00520-F1-model_v4.cif") + +# Extract coordinates +coords = [] +for model in structure: + for chain in model: + for residue in chain: + if 'CA' in residue: # Alpha carbons only + coords.append(residue['CA'].get_coord()) + +coords = np.array(coords) +print(f"Structure has {len(coords)} residues") + +# Calculate distances +from scipy.spatial.distance import pdist, squareform +distance_matrix = squareform(pdist(coords)) + +# Identify contacts (< 8 Å) +contacts = np.where((distance_matrix > 0) & (distance_matrix < 8)) +print(f"Number of contacts: {len(contacts[0]) // 2}") +``` + +**Extract B-factors (pLDDT values):** + +AlphaFold stores pLDDT scores in the B-factor column: + +```python +from Bio.PDB import MMCIFParser + +parser = MMCIFParser(QUIET=True) +structure = parser.get_structure("protein", "AF-P00520-F1-model_v4.cif") + +# Extract pLDDT from B-factors +plddt_scores = [] +for model in structure: + for chain in model: + for residue in chain: + if 'CA' in residue: + plddt_scores.append(residue['CA'].get_bfactor()) + +# Identify high-confidence regions +high_conf_regions = [(i, score) for i, score in enumerate(plddt_scores, 1) if score > 90] +print(f"High confidence residues: {len(high_conf_regions)}") +``` + +### 6. 
Batch Processing Multiple Proteins + +Process multiple predictions efficiently: + +```python +from Bio.PDB import alphafold_db +import pandas as pd + +uniprot_ids = ["P00520", "P12931", "P04637"] # Multiple proteins +results = [] + +for uniprot_id in uniprot_ids: + try: + # Get prediction + predictions = list(alphafold_db.get_predictions(uniprot_id)) + + if predictions: + pred = predictions[0] + + # Download structure + cif_file = alphafold_db.download_cif_for(pred, directory="./batch_structures") + + # Get confidence data + alphafold_id = pred['entryId'] + conf_url = f"https://alphafold.ebi.ac.uk/files/{alphafold_id}-confidence_v4.json" + conf_data = requests.get(conf_url).json() + + # Calculate statistics + plddt_scores = conf_data['confidenceScore'] + avg_plddt = np.mean(plddt_scores) + high_conf_fraction = sum(1 for s in plddt_scores if s > 90) / len(plddt_scores) + + results.append({ + 'uniprot_id': uniprot_id, + 'alphafold_id': alphafold_id, + 'avg_plddt': avg_plddt, + 'high_conf_fraction': high_conf_fraction, + 'length': len(plddt_scores) + }) + except Exception as e: + print(f"Error processing {uniprot_id}: {e}") + +# Create summary DataFrame +df = pd.DataFrame(results) +print(df) +``` + +## Installation and Setup + +### Python Libraries + +```bash +# Install Biopython for structure access +uv pip install biopython + +# Install requests for API access +uv pip install requests + +# For visualization and analysis +uv pip install numpy matplotlib pandas scipy + +# For Google Cloud access (optional) +uv pip install google-cloud-bigquery gsutil +``` + +### 3D-Beacons API Alternative + +AlphaFold can also be accessed via the 3D-Beacons federated API: + +```python +import requests + +# Query via 3D-Beacons +uniprot_id = "P00520" +url = f"https://www.ebi.ac.uk/pdbe/pdbe-kb/3dbeacons/api/uniprot/summary/{uniprot_id}.json" +response = requests.get(url) +data = response.json() + +# Filter for AlphaFold structures +af_structures = [s for s in data['structures'] if 
s['provider'] == 'AlphaFold DB'] +``` + +## Common Use Cases + +### Structural Proteomics +- Download complete proteome predictions for analysis +- Identify high-confidence structural regions across proteins +- Compare predicted structures with experimental data +- Build structural models for protein families + +### Drug Discovery +- Retrieve target protein structures for docking studies +- Analyze binding site conformations +- Identify druggable pockets in predicted structures +- Compare structures across homologs + +### Protein Engineering +- Identify stable/unstable regions using pLDDT +- Design mutations in high-confidence regions +- Analyze domain architectures using PAE +- Model protein variants and mutations + +### Evolutionary Studies +- Compare ortholog structures across species +- Analyze conservation of structural features +- Study domain evolution patterns +- Identify functionally important regions + +## Key Concepts + +**UniProt Accession:** Primary identifier for proteins (e.g., "P00520"). Required for querying AlphaFold DB. + +**AlphaFold ID:** Internal identifier format: `AF-[UniProt accession]-F[fragment number]` (e.g., "AF-P00520-F1"). + +**pLDDT (predicted Local Distance Difference Test):** Per-residue confidence metric (0-100). Higher values indicate more confident predictions. + +**PAE (Predicted Aligned Error):** Matrix indicating confidence in relative positions between residue pairs. Low values (<5 Å) suggest confident relative positioning. + +**Database Version:** Current version is v4. File URLs include version suffix (e.g., `model_v4.cif`). + +**Fragment Number:** Large proteins may be split into fragments. Fragment number appears in AlphaFold ID (e.g., F1, F2). 
+ +## Confidence Interpretation Guidelines + +**pLDDT Thresholds:** +- **>90**: Very high confidence - suitable for detailed analysis +- **70-90**: High confidence - generally reliable backbone structure +- **50-70**: Low confidence - use with caution, flexible regions +- **<50**: Very low confidence - likely disordered or unreliable + +**PAE Guidelines:** +- **<5 Å**: Confident relative positioning of domains +- **5-10 Å**: Moderate confidence in arrangement +- **>15 Å**: Uncertain relative positions, domains may be mobile + +## Resources + +### references/api_reference.md + +Comprehensive API documentation covering: +- Complete REST API endpoint specifications +- File format details and data schemas +- Google Cloud dataset structure and access patterns +- Advanced query examples and batch processing strategies +- Rate limiting, caching, and best practices +- Troubleshooting common issues + +Consult this reference for detailed API information, bulk download strategies, or when working with large-scale datasets. + +## Important Notes + +### Data Usage and Attribution + +- AlphaFold DB is freely available under CC-BY-4.0 license +- Cite: Jumper et al. (2021) Nature and Varadi et al. 
(2022, 2024) Nucleic Acids Research +- Predictions are computational models, not experimental structures +- Always assess confidence metrics before downstream analysis + +### Version Management + +- Current database version: v4 (as of 2024-2025) +- File URLs include version suffix (e.g., `_v4.cif`) +- Check for database updates regularly +- Older versions may be deprecated over time + +### Data Quality Considerations + +- High pLDDT doesn't guarantee functional accuracy +- Low confidence regions may be disordered in vivo +- PAE indicates relative domain confidence, not absolute positioning +- Predictions lack ligands, post-translational modifications, and cofactors +- Multi-chain complexes are not predicted (single chains only) + +### Performance Tips + +- Use Biopython for simple single-protein access +- Use Google Cloud for bulk downloads (much faster than individual files) +- Cache downloaded files locally to avoid repeated downloads +- BigQuery free tier: 1 TB processed data per month +- Consider network bandwidth for large-scale downloads + +## Additional Resources + +- **AlphaFold DB Website:** https://alphafold.ebi.ac.uk/ +- **API Documentation:** https://alphafold.ebi.ac.uk/api-docs +- **Google Cloud Dataset:** https://cloud.google.com/blog/products/ai-machine-learning/alphafold-protein-structure-database +- **3D-Beacons API:** https://www.ebi.ac.uk/pdbe/pdbe-kb/3dbeacons/ +- **AlphaFold Papers:** + - Nature (2021): https://doi.org/10.1038/s41586-021-03819-2 + - Nucleic Acids Research (2024): https://doi.org/10.1093/nar/gkad1011 +- **Biopython Documentation:** https://biopython.org/docs/dev/api/Bio.PDB.alphafold_db.html +- **GitHub Repository:** https://github.com/google-deepmind/alphafold diff --git a/container/skills/bio-alternative-splicing/scripts/analyze b/container/skills/bio-alternative-splicing/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-alternative-splicing/scripts/analyze @@ -0,0 +1 @@ +404: Not 
Found \ No newline at end of file diff --git a/container/skills/bio-alternative-splicing/scripts/process b/container/skills/bio-alternative-splicing/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-alternative-splicing/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-alternative-splicing/scripts/run b/container/skills/bio-alternative-splicing/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-alternative-splicing/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-alternative-splicing/scripts/search b/container/skills/bio-alternative-splicing/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-alternative-splicing/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-alternative-splicing/skill.md b/container/skills/bio-alternative-splicing/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-alternative-splicing/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-atac-seq-analysis/scripts/analyze b/container/skills/bio-atac-seq-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-atac-seq-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-atac-seq-analysis/scripts/process b/container/skills/bio-atac-seq-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-atac-seq-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-atac-seq-analysis/scripts/run b/container/skills/bio-atac-seq-analysis/scripts/run new file mode 100755 index 
0000000..1becba2 --- /dev/null +++ b/container/skills/bio-atac-seq-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-atac-seq-analysis/scripts/search b/container/skills/bio-atac-seq-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-atac-seq-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-atac-seq-analysis/skill.md b/container/skills/bio-atac-seq-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-atac-seq-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bcr-seq/scripts/analyze b/container/skills/bio-bcr-seq/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bcr-seq/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bcr-seq/scripts/process b/container/skills/bio-bcr-seq/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bcr-seq/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bcr-seq/scripts/run b/container/skills/bio-bcr-seq/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bcr-seq/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bcr-seq/scripts/search b/container/skills/bio-bcr-seq/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bcr-seq/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bcr-seq/skill.md b/container/skills/bio-bcr-seq/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bcr-seq/skill.md @@ -0,0 
+1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bisulfite-seq/scripts/analyze b/container/skills/bio-bisulfite-seq/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bisulfite-seq/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bisulfite-seq/scripts/process b/container/skills/bio-bisulfite-seq/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bisulfite-seq/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bisulfite-seq/scripts/run b/container/skills/bio-bisulfite-seq/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bisulfite-seq/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bisulfite-seq/scripts/search b/container/skills/bio-bisulfite-seq/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bisulfite-seq/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-bisulfite-seq/skill.md b/container/skills/bio-bisulfite-seq/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-bisulfite-seq/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-annotation/scripts/analyze b/container/skills/bio-chip-seq-annotation/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-annotation/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-annotation/scripts/process b/container/skills/bio-chip-seq-annotation/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/bio-chip-seq-annotation/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-annotation/scripts/run b/container/skills/bio-chip-seq-annotation/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-annotation/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-annotation/scripts/search b/container/skills/bio-chip-seq-annotation/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-annotation/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-annotation/skill.md b/container/skills/bio-chip-seq-annotation/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-annotation/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-motif-analysis/scripts/analyze b/container/skills/bio-chip-seq-motif-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-motif-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-motif-analysis/scripts/process b/container/skills/bio-chip-seq-motif-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-motif-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-motif-analysis/scripts/run b/container/skills/bio-chip-seq-motif-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-motif-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git 
a/container/skills/bio-chip-seq-motif-analysis/scripts/search b/container/skills/bio-chip-seq-motif-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-motif-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-motif-analysis/skill.md b/container/skills/bio-chip-seq-motif-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-motif-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-peak-calling/scripts/analyze b/container/skills/bio-chip-seq-peak-calling/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-peak-calling/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-peak-calling/scripts/process b/container/skills/bio-chip-seq-peak-calling/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-peak-calling/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-peak-calling/scripts/run b/container/skills/bio-chip-seq-peak-calling/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-peak-calling/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-peak-calling/scripts/search b/container/skills/bio-chip-seq-peak-calling/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-peak-calling/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-chip-seq-peak-calling/skill.md b/container/skills/bio-chip-seq-peak-calling/skill.md new file mode 100644 index 
0000000..1becba2 --- /dev/null +++ b/container/skills/bio-chip-seq-peak-calling/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-copy-number-variation/scripts/analyze b/container/skills/bio-copy-number-variation/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-copy-number-variation/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-copy-number-variation/scripts/process b/container/skills/bio-copy-number-variation/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-copy-number-variation/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-copy-number-variation/scripts/run b/container/skills/bio-copy-number-variation/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-copy-number-variation/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-copy-number-variation/scripts/search b/container/skills/bio-copy-number-variation/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-copy-number-variation/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-copy-number-variation/skill.md b/container/skills/bio-copy-number-variation/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-copy-number-variation/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-differential-expression-edger/scripts/analyze b/container/skills/bio-differential-expression-edger/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-differential-expression-edger/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No 
newline at end of file diff --git a/container/skills/bio-differential-expression-edger/scripts/process b/container/skills/bio-differential-expression-edger/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-differential-expression-edger/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-differential-expression-edger/scripts/run b/container/skills/bio-differential-expression-edger/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-differential-expression-edger/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-differential-expression-edger/scripts/search b/container/skills/bio-differential-expression-edger/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-differential-expression-edger/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-differential-expression-edger/skill.md b/container/skills/bio-differential-expression-edger/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-differential-expression-edger/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-gene-regulatory-networks/scripts/analyze b/container/skills/bio-gene-regulatory-networks/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-gene-regulatory-networks/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-gene-regulatory-networks/scripts/process b/container/skills/bio-gene-regulatory-networks/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-gene-regulatory-networks/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git 
a/container/skills/bio-gene-regulatory-networks/scripts/run b/container/skills/bio-gene-regulatory-networks/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-gene-regulatory-networks/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-gene-regulatory-networks/scripts/search b/container/skills/bio-gene-regulatory-networks/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-gene-regulatory-networks/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-gene-regulatory-networks/skill.md b/container/skills/bio-gene-regulatory-networks/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-gene-regulatory-networks/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-go-enrichment/scripts/analyze b/container/skills/bio-go-enrichment/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-go-enrichment/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-go-enrichment/scripts/process b/container/skills/bio-go-enrichment/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-go-enrichment/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-go-enrichment/scripts/run b/container/skills/bio-go-enrichment/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-go-enrichment/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-go-enrichment/scripts/search b/container/skills/bio-go-enrichment/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/bio-go-enrichment/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-go-enrichment/skill.md b/container/skills/bio-go-enrichment/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-go-enrichment/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hi-c-analysis/scripts/analyze b/container/skills/bio-hi-c-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hi-c-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hi-c-analysis/scripts/process b/container/skills/bio-hi-c-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hi-c-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hi-c-analysis/scripts/run b/container/skills/bio-hi-c-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hi-c-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hi-c-analysis/scripts/search b/container/skills/bio-hi-c-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hi-c-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hi-c-analysis/skill.md b/container/skills/bio-hi-c-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hi-c-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hla-typing/scripts/analyze b/container/skills/bio-hla-typing/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/bio-hla-typing/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hla-typing/scripts/process b/container/skills/bio-hla-typing/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hla-typing/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hla-typing/scripts/run b/container/skills/bio-hla-typing/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hla-typing/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hla-typing/scripts/search b/container/skills/bio-hla-typing/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hla-typing/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-hla-typing/skill.md b/container/skills/bio-hla-typing/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-hla-typing/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-isoform-detection/scripts/analyze b/container/skills/bio-isoform-detection/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-isoform-detection/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-isoform-detection/scripts/process b/container/skills/bio-isoform-detection/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-isoform-detection/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-isoform-detection/scripts/run b/container/skills/bio-isoform-detection/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/bio-isoform-detection/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-isoform-detection/scripts/search b/container/skills/bio-isoform-detection/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-isoform-detection/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-isoform-detection/skill.md b/container/skills/bio-isoform-detection/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-isoform-detection/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-kegg-pathway-analysis/scripts/analyze b/container/skills/bio-kegg-pathway-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-kegg-pathway-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-kegg-pathway-analysis/scripts/process b/container/skills/bio-kegg-pathway-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-kegg-pathway-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-kegg-pathway-analysis/scripts/run b/container/skills/bio-kegg-pathway-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-kegg-pathway-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-kegg-pathway-analysis/scripts/search b/container/skills/bio-kegg-pathway-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-kegg-pathway-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-kegg-pathway-analysis/skill.md 
b/container/skills/bio-kegg-pathway-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-kegg-pathway-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-long-read-assembly/scripts/analyze b/container/skills/bio-long-read-assembly/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-long-read-assembly/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-long-read-assembly/scripts/process b/container/skills/bio-long-read-assembly/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-long-read-assembly/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-long-read-assembly/scripts/run b/container/skills/bio-long-read-assembly/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-long-read-assembly/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-long-read-assembly/scripts/search b/container/skills/bio-long-read-assembly/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-long-read-assembly/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-long-read-assembly/skill.md b/container/skills/bio-long-read-assembly/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-long-read-assembly/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-annotation/scripts/analyze b/container/skills/bio-metabolomics-annotation/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-annotation/scripts/analyze @@ -0,0 +1 @@ +404: 
Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-annotation/scripts/process b/container/skills/bio-metabolomics-annotation/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-annotation/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-annotation/scripts/run b/container/skills/bio-metabolomics-annotation/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-annotation/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-annotation/scripts/search b/container/skills/bio-metabolomics-annotation/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-annotation/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-annotation/skill.md b/container/skills/bio-metabolomics-annotation/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-annotation/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-pathway-mapping/scripts/analyze b/container/skills/bio-metabolomics-pathway-mapping/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-pathway-mapping/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-pathway-mapping/scripts/process b/container/skills/bio-metabolomics-pathway-mapping/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-pathway-mapping/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git 
a/container/skills/bio-metabolomics-pathway-mapping/scripts/run b/container/skills/bio-metabolomics-pathway-mapping/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-pathway-mapping/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-pathway-mapping/scripts/search b/container/skills/bio-metabolomics-pathway-mapping/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-metabolomics-pathway-mapping/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-metabolomics-pathway-mapping/skill.md b/container/skills/bio-metabolomics-pathway-mapping/skill.md new file mode 100644 index 0000000..87dc5e1 --- /dev/null +++ b/container/skills/bio-metabolomics-pathway-mapping/skill.md @@ -0,0 +1,237 @@ +--- +name: bio-metabolomics-pathway-mapping +description: Map metabolites to biological pathways using KEGG, Reactome, and MetaboAnalyst. Perform pathway enrichment and topology analysis. Use when interpreting metabolomics results in the context of biochemical pathways. +tool_type: r +primary_tool: MetaboAnalystR +--- + +## Version Compatibility + +Reference examples tested with: ReactomePA 1.46+, clusterProfiler 4.10+ + +Before using code patterns, verify installed versions match. If versions differ: +- R: `packageVersion('<pkg>')` then `?function_name` to verify parameters + +If code throws an error such as `could not find function` or `unused argument`, introspect the installed +package and adapt the example to match the actual API rather than retrying. + +# Metabolomics Pathway Mapping + +**"Map my metabolites to pathways"** → Perform pathway enrichment and topology analysis using KEGG, Reactome, or MetaboAnalyst to interpret metabolomics results in biochemical context. 
+- R: `MetaboAnalystR::SetMetabolomeFilter()` → `PerformDetailMatch()` → pathway topology + +## KEGG Pathway Enrichment + +```r +library(MetaboAnalystR) + +# Initialize MetaboAnalyst +mSet <- InitDataObjects('conc', 'pathora', FALSE) + +# Set organism +mSet <- SetOrganism(mSet, 'hsa') # Human + +# Load metabolite list (HMDB IDs or compound names) +metabolites <- c('HMDB0000001', 'HMDB0000005', 'HMDB0000010') # Example HMDB IDs +# Or use names: c('Glucose', 'Lactate', 'Pyruvate') + +mSet <- Setup.MapData(mSet, metabolites) +mSet <- CrossReferencing(mSet, 'hmdb') # Or 'name', 'kegg', 'pubchem' + +# Pathway analysis +mSet <- SetKEGG.PathLib(mSet, 'hsa', 'current') +mSet <- SetMetabolomeFilter(mSet, FALSE) +mSet <- CalculateOraScore(mSet, 'rbc', 'hyperg') # Over-representation + +# Get results +pathway_results <- mSet$analSet$ora.mat +print(pathway_results) +``` + +## Quantitative Enrichment Analysis (QEA) + +```r +# For continuous data (fold changes or concentrations) +mSet <- InitDataObjects('conc', 'pathqea', FALSE) +mSet <- SetOrganism(mSet, 'hsa') + +# Load data with values +metabolite_data <- data.frame( + compound = c('Glucose', 'Lactate', 'Pyruvate'), + fc = c(1.5, 2.3, 0.7) # Fold changes +) + +mSet <- Setup.MapData(mSet, metabolite_data) +mSet <- CrossReferencing(mSet, 'name') + +# QEA analysis +mSet <- SetKEGG.PathLib(mSet, 'hsa', 'current') +mSet <- CalculateQeaScore(mSet, 'rbc', 'gt') + +# Results +qea_results <- mSet$analSet$qea.mat +``` + +## Topology-Based Analysis + +```r +# Considers pathway structure (betweenness, degree) +mSet <- InitDataObjects('conc', 'pathinteg', FALSE) +mSet <- SetOrganism(mSet, 'hsa') + +mSet <- Setup.MapData(mSet, metabolites) +mSet <- CrossReferencing(mSet, 'hmdb') + +# Topology analysis +mSet <- SetKEGG.PathLib(mSet, 'hsa', 'current') +mSet <- SetMetabolomeFilter(mSet, FALSE) +mSet <- CalculateHyperScore(mSet) # Combined ORA + topology + +topo_results <- mSet$analSet$topo.mat +``` + +## Reactome Pathways + +```r 
+library(ReactomePA) +library(clusterProfiler) + +# Convert to Reactome IDs (if available) +reactome_ids <- c('R-HSA-70171', 'R-HSA-1428517') # Example + +# Enrichment +enriched <- enrichPathway(gene = reactome_ids, organism = 'human', pvalueCutoff = 0.05) +print(enriched) +``` + +## KEGG Mapper (Direct API) + +```r +library(KEGGREST) + +# Get pathway information +pathway_info <- keggGet('hsa00010') # Glycolysis + +# Map compounds to pathways +kegg_ids <- c('C00031', 'C00186', 'C00022') # Glucose, Lactate, Pyruvate + +# Find pathways containing these compounds +find_pathways <- function(kegg_id) { + pathways <- keggLink('pathway', kegg_id) + return(pathways) +} + +all_pathways <- lapply(kegg_ids, find_pathways) +``` + +## Pathway Visualization + +```r +library(pathview) + +# Visualize KEGG pathway with metabolite data +metabolite_data <- c('C00031' = 1.5, 'C00186' = 2.3, 'C00022' = 0.7) + +pathview(cpd.data = metabolite_data, + pathway.id = '00010', # Glycolysis + species = 'hsa', + cpd.idtype = 'kegg', + out.suffix = 'glycolysis_mapped') + +# Output: hsa00010.glycolysis_mapped.png +``` + +## Network-Based Analysis + +**Goal:** Visualize metabolite-pathway relationships as a bipartite network for identifying pathway crosstalk and hub metabolites. + +**Approach:** Extract metabolite-pathway edges from enrichment results, build an igraph network, and annotate nodes by type for interactive visualization. 
+ +```r +library(igraph) + +# Build metabolite-pathway network +build_network <- function(pathway_results) { + edges <- data.frame() + + for (i in 1:nrow(pathway_results)) { + pathway <- rownames(pathway_results)[i] + metabolites <- strsplit(pathway_results$Metabolites[i], '; ')[[1]] + + for (met in metabolites) { + edges <- rbind(edges, data.frame(from = met, to = pathway)) + } + } + + g <- graph_from_data_frame(edges, directed = FALSE) + + # Add attributes + V(g)$type <- ifelse(V(g)$name %in% edges$from, 'metabolite', 'pathway') + + return(g) +} + +network <- build_network(pathway_results) +plot(network, vertex.size = ifelse(V(network)$type == 'pathway', 15, 5)) +``` + +## Metabolite Set Enrichment + +```r +# MSEA using predefined metabolite sets +mSet <- InitDataObjects('conc', 'msetora', FALSE) + +# Use SMPDB (Small Molecule Pathway Database) +mSet <- SetMetabolomeFilter(mSet, FALSE) +mSet <- SetCurrentMsetLib(mSet, 'smpdb_pathway', 2) + +mSet <- Setup.MapData(mSet, metabolites) +mSet <- CrossReferencing(mSet, 'hmdb') + +mSet <- CalculateHyperScore(mSet) +msea_results <- mSet$analSet$ora.mat +``` + +## Combine with Gene Expression + +```r +# Integrated pathway analysis (metabolites + genes) +library(IMPaLA) + +# Prepare gene list +genes <- c('HK1', 'PFKM', 'ALDOA') # Glycolysis enzymes + +# Prepare metabolite list +metabolites <- c('HMDB0000122', 'HMDB0000190') # Glucose, Lactate + +# Joint pathway analysis +# (Use MetaboAnalyst joint pathway analysis or custom integration) +``` + +## Export Results + +```r +# Format for publication +export_pathways <- function(results, output_file) { + results_df <- as.data.frame(results) + results_df$pathway <- rownames(results) + + # Select relevant columns + results_df <- results_df[, c('pathway', 'Total', 'Expected', 'Hits', + 'Raw p', 'Holm adjust', 'FDR', 'Impact')] + + # Sort by FDR + results_df <- results_df[order(results_df$FDR), ] + + write.csv(results_df, output_file, row.names = FALSE) + return(results_df) +} + 
+export_pathways(pathway_results, 'pathway_enrichment.csv') +``` + +## Related Skills + +- metabolite-annotation - Identify metabolites first +- statistical-analysis - Get significant metabolites +- pathway-analysis/kegg-pathways - Similar enrichment concepts for genes diff --git a/container/skills/bio-methylation-analysis/scripts/analyze b/container/skills/bio-methylation-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-methylation-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-methylation-analysis/scripts/process b/container/skills/bio-methylation-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-methylation-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-methylation-analysis/scripts/run b/container/skills/bio-methylation-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-methylation-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-methylation-analysis/scripts/search b/container/skills/bio-methylation-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-methylation-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-methylation-analysis/skill.md b/container/skills/bio-methylation-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-methylation-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-motif-discovery/scripts/analyze b/container/skills/bio-motif-discovery/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/bio-motif-discovery/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-motif-discovery/scripts/process b/container/skills/bio-motif-discovery/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-motif-discovery/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-motif-discovery/scripts/run b/container/skills/bio-motif-discovery/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-motif-discovery/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-motif-discovery/scripts/search b/container/skills/bio-motif-discovery/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-motif-discovery/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-motif-discovery/skill.md b/container/skills/bio-motif-discovery/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-motif-discovery/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-multiome-analysis/scripts/analyze b/container/skills/bio-multiome-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-multiome-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-multiome-analysis/scripts/process b/container/skills/bio-multiome-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-multiome-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-multiome-analysis/scripts/run b/container/skills/bio-multiome-analysis/scripts/run new file mode 
100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-multiome-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-multiome-analysis/scripts/search b/container/skills/bio-multiome-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-multiome-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-multiome-analysis/skill.md b/container/skills/bio-multiome-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-multiome-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-nanopore-analysis/scripts/analyze b/container/skills/bio-nanopore-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-nanopore-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-nanopore-analysis/scripts/process b/container/skills/bio-nanopore-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-nanopore-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-nanopore-analysis/scripts/run b/container/skills/bio-nanopore-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-nanopore-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-nanopore-analysis/scripts/search b/container/skills/bio-nanopore-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-nanopore-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-nanopore-analysis/skill.md 
b/container/skills/bio-nanopore-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-nanopore-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-pathway-enrichment/scripts/analyze b/container/skills/bio-pathway-enrichment/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-pathway-enrichment/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-pathway-enrichment/scripts/process b/container/skills/bio-pathway-enrichment/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-pathway-enrichment/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-pathway-enrichment/scripts/run b/container/skills/bio-pathway-enrichment/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-pathway-enrichment/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-pathway-enrichment/scripts/search b/container/skills/bio-pathway-enrichment/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-pathway-enrichment/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-pathway-enrichment/skill.md b/container/skills/bio-pathway-enrichment/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-pathway-enrichment/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-phylogenetic-analysis/scripts/analyze b/container/skills/bio-phylogenetic-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-phylogenetic-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No 
newline at end of file diff --git a/container/skills/bio-phylogenetic-analysis/scripts/process b/container/skills/bio-phylogenetic-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-phylogenetic-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-phylogenetic-analysis/scripts/run b/container/skills/bio-phylogenetic-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-phylogenetic-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-phylogenetic-analysis/scripts/search b/container/skills/bio-phylogenetic-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-phylogenetic-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-phylogenetic-analysis/skill.md b/container/skills/bio-phylogenetic-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-phylogenetic-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-protein-interaction-networks/scripts/analyze b/container/skills/bio-protein-interaction-networks/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-protein-interaction-networks/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-protein-interaction-networks/scripts/process b/container/skills/bio-protein-interaction-networks/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-protein-interaction-networks/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-protein-interaction-networks/scripts/run 
b/container/skills/bio-protein-interaction-networks/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-protein-interaction-networks/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-protein-interaction-networks/scripts/search b/container/skills/bio-protein-interaction-networks/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-protein-interaction-networks/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-protein-interaction-networks/skill.md b/container/skills/bio-protein-interaction-networks/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-protein-interaction-networks/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-proteomics-quantification/scripts/analyze b/container/skills/bio-proteomics-quantification/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-proteomics-quantification/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-proteomics-quantification/scripts/process b/container/skills/bio-proteomics-quantification/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-proteomics-quantification/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-proteomics-quantification/scripts/run b/container/skills/bio-proteomics-quantification/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-proteomics-quantification/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-proteomics-quantification/scripts/search b/container/skills/bio-proteomics-quantification/scripts/search 
new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-proteomics-quantification/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-proteomics-quantification/skill.md b/container/skills/bio-proteomics-quantification/skill.md new file mode 100644 index 0000000..ec05b12 --- /dev/null +++ b/container/skills/bio-proteomics-quantification/skill.md @@ -0,0 +1,141 @@ +--- +name: bio-proteomics-quantification +description: Protein quantification from mass spectrometry data including label-free (LFQ, intensity-based), isobaric labeling (TMT, iTRAQ), and metabolic labeling (SILAC) approaches. Use when extracting protein abundances from MS data for differential analysis. +tool_type: mixed +primary_tool: MSstats +--- + +## Version Compatibility + +Reference examples tested with: MSnbase 2.28+, numpy 1.26+, pandas 2.2+ + +Before using code patterns, verify installed versions match. If versions differ: +- Python: `pip show package_name` then `help(module.function)` to check signatures +- R: `packageVersion('package_name')` then `?function_name` to verify parameters + +If code throws ImportError, AttributeError, or TypeError, introspect the installed +package and adapt the example to match the actual API rather than retrying. + +# Protein Quantification + +**"Quantify proteins from my mass spec data"** → Extract protein abundances from MS data using label-free (LFQ, spectral counting), isobaric labeling (TMT, iTRAQ), or metabolic labeling (SILAC) approaches. 
+- R: `MSstats::dataProcess()` for feature-to-protein summarization +- Python: `pandas` for MaxLFQ-style normalization and ratio calculation +- R: `MSnbase` for isobaric tag reporter ion extraction + +## Label-Free Quantification (LFQ) + +### Intensity-Based (MaxLFQ Algorithm) + +```python +import pandas as pd +import numpy as np + +def maxlfq_normalize(intensities): + '''Simplified MaxLFQ normalization''' + log_int = np.log2(intensities.replace(0, np.nan)) + + # Median centering per sample + sample_medians = log_int.median(axis=0) + global_median = sample_medians.median() + normalized = log_int - sample_medians + global_median + + return normalized +``` + +### Spectral Counting + +```python +def spectral_count_normalize(counts, total_spectra): + '''Normalized spectral abundance factor (NSAF)''' + # Simplified: divides by total spectra only; true NSAF first divides counts by protein length + nsaf = counts / total_spectra + return nsaf / nsaf.sum() +``` + +## TMT/iTRAQ Quantification + +```r +library(MSnbase) + +# Load reporter ion data +tmt_data <- readMSnSet('tmt_data.txt') + +# Normalize by median-centering (no reference channel is used here) +tmt_normalized <- normalize(tmt_data, method = 'center.median') + +# Summarize to protein level +protein_data <- combineFeatures(tmt_normalized, groupBy = fData(tmt_data)$protein, + fun = 'median') +``` + +### Python TMT Processing + +```python +def extract_tmt_intensities(spectrum, reporter_mz, tolerance=0.003): + '''Extract TMT reporter ion intensities''' + mz, intensity = spectrum.get_peaks() + tmt_intensities = {} + + for channel, target_mz in reporter_mz.items(): + mask = np.abs(mz - target_mz) < tolerance + if mask.any(): + tmt_intensities[channel] = intensity[mask].max() + else: + tmt_intensities[channel] = 0 + + return tmt_intensities + +TMT_10PLEX = {'126': 126.127726, '127N': 127.124761, '127C': 127.131081, + '128N': 128.128116, '128C': 128.134436, '129N': 129.131471, + '129C': 129.137790, '130N': 130.134825, 
'130C': 130.141145, + '131': 131.138180} +``` + +## SILAC Quantification + +```python 
+def calculate_silac_ratio(heavy_intensity, light_intensity): + '''Calculate SILAC H/L ratio''' + if light_intensity > 0 and heavy_intensity > 0: + return np.log2(heavy_intensity / light_intensity) + return np.nan + +# Typical mass shifts +SILAC_SHIFTS = { + 'Arg10': 10.008269, # 13C6 15N4 Arginine + 'Lys8': 8.014199, # 13C6 15N2 Lysine + 'Arg6': 6.020129, # 13C6 Arginine + 'Lys6': 6.020129 # 13C6 Lysine +} +``` + +## MSstats Workflow (R) + +**Goal:** Convert MaxQuant output into normalized protein-level abundance estimates using MSstats feature-to-protein summarization. + +**Approach:** Reformat MaxQuant evidence and proteinGroups files into MSstats input format, then apply median equalization normalization with Tukey's median polish for protein-level summarization. + +```r +library(MSstats) + +# Prepare input from MaxQuant +maxquant_input <- MaxQtoMSstatsFormat( + evidence = read.table('evidence.txt', sep = '\t', header = TRUE), + proteinGroups = read.table('proteinGroups.txt', sep = '\t', header = TRUE), + annotation = read.csv('annotation.csv') +) + +# Process and normalize +processed <- dataProcess(maxquant_input, normalization = 'equalizeMedians', + summaryMethod = 'TMP', censoredInt = 'NA') + +# Protein-level summary +protein_summary <- quantification(processed) +``` + +## Related Skills + +- data-import - Load MS data before quantification +- differential-abundance - Statistical testing after quantification +- expression-matrix/counts-ingest - Similar matrix handling diff --git a/container/skills/bio-research-pipeline/SKILL.md b/container/skills/bio-research-pipeline/SKILL.md new file mode 100644 index 0000000..39508eb --- /dev/null +++ b/container/skills/bio-research-pipeline/SKILL.md @@ -0,0 +1,479 @@ +--- +name: bio-research-pipeline +description: > + Biological hypothesis generation pipeline. 
Given a broad research direction, + runs parallel literature searches (PubMed + preprints + pathway DBs), generates + ≥5 mechanistic hypotheses, conducts multi-agent debate to select top 3, then + designs wet-lab experimental plans for each. Outputs a structured research brief. +keywords: + - bio-research-pipeline + - hypothesis-generation + - literature-review + - experimental-design + - wet-lab + - multi-agent-debate + - pathway-analysis +--- + +# Biological Research Hypothesis Pipeline + +You are a research coordinator orchestrating a multi-stage biological research pipeline. +This skill is triggered when a user provides a broad biological research direction and wants: +- A thorough literature review +- Multiple mechanistic hypotheses +- Multi-perspective critique and ranking +- Wet-lab experimental designs + +--- + +## SCRIPT PATH RESOLUTION + +This skill includes three helper scripts. Before running them, locate them: + +```bash +SKILL_DIR=$(find ~/.claude/skills -name "SKILL.md" -path "*/bio-research-pipeline/*" | xargs dirname 2>/dev/null | head -1) +PUBMED_SCRIPT="$SKILL_DIR/scripts/pubmed-fetch" +PREPRINT_SCRIPT="$SKILL_DIR/scripts/preprint-fetch" +PATHWAY_SCRIPT="$SKILL_DIR/scripts/pathway-search" +echo "Skill dir: $SKILL_DIR" +ls "$SKILL_DIR/scripts/" +``` + +Then invoke scripts as: +```bash +python3 "$PUBMED_SCRIPT" "your topic" --max 40 --years 5 +python3 "$PREPRINT_SCRIPT" "your topic" --max 30 --days 180 +python3 "$PATHWAY_SCRIPT" "your topic" --gene GENE_SYMBOL +``` + +--- + +## HOW TO INVOKE THIS PIPELINE + +When a user asks something like: +- "帮我研究一下 [方向]" +- "我想研究 [X] 机制,帮我提假说" +- "针对 [疾病/通路/基因],设计一个研究方向" +- "run bio-research-pipeline on [topic]" + +Parse the research direction from the user's message, then execute all 5 stages below **in order**. + +--- + +## STAGE 1 — PARALLEL LITERATURE SEARCH (3 MODELS) + +**Goal:** Cast a wide net using three models simultaneously — each with a different strength. 
+ +| Role | Model | Responsibility | +|------|-------|---------------| +| Task A | **Claude** (you) | Real data fetch — PubMed API + bioRxiv API, runs Python scripts | +| Task B | **MiniMax** | Biomedical knowledge analysis, Chinese literature context, cross-checking | +| Task C | **Qwen** | Pathway landscape mapping, fast synthesis, regulatory network overview | + +Use the `Task` tool to launch **all 3 tasks in the same message** (parallel). Do NOT wait for one before starting the next. + +--- + +### Task A — Claude: Real Literature Fetch (PubMed + Preprints) + +This task runs actual API calls using the helper scripts in this skill. + +Prompt for Task A: +``` +You are doing a real literature fetch for the bio-research-pipeline. +Research direction: {RESEARCH_DIRECTION} + +Step 1 — Locate skill scripts: +SKILL_DIR=$(find ~/.claude/skills -name "SKILL.md" -path "*/bio-research-pipeline/*" | xargs dirname 2>/dev/null | head -1) + +Step 2 — Run PubMed fetch: +python3 "$SKILL_DIR/scripts/pubmed-fetch" "{RESEARCH_DIRECTION}" --max 40 --years 5 + +Step 3 — Run preprint fetch: +python3 "$SKILL_DIR/scripts/preprint-fetch" "{RESEARCH_DIRECTION}" --max 25 --days 180 + +Step 4 — Synthesize into structured summary: +- Key molecular mechanisms found +- Key proteins/genes/pathways mentioned +- Most significant recent findings (2023-2025) +- Any contradictions or open debates +- Top 5 most relevant papers with PMID/DOI + +Output all results clearly labeled with section headers. +``` + +--- + +### Task B — MiniMax: Biomedical Knowledge Analysis + +This task calls MiniMax via `mcp__bioclaw__call_minimax`. MiniMax contributes its own training knowledge — particularly strong on Chinese biomedical literature, clinical context, and TCM-related pathways. 
+ +Prompt for Task B: +``` +Call mcp__bioclaw__call_minimax with this prompt: + +system: "You are an expert biomedical research analyst with deep knowledge of molecular biology, disease mechanisms, and the latest research trends in both Western and Chinese scientific literature." + +prompt: "Research direction: {RESEARCH_DIRECTION} + +Please provide a comprehensive analysis covering: + +1. CURRENT STATE OF THE FIELD + - What is well-established about this topic? + - What are the 3-5 most important mechanistic insights from recent years? + - Which research groups/labs are leading this field? + +2. KEY MOLECULAR PLAYERS + - List the most important proteins, genes, and non-coding RNAs involved + - Describe their known roles and interactions + - Note any recently discovered players (2022-2025) + +3. DISEASE RELEVANCE + - Which diseases/conditions is this most relevant to? + - What is the current clinical/translational status? + - Any recent clinical trials or translational breakthroughs? + +4. KNOWLEDGE GAPS + - What are the most important unresolved questions? + - Where do different research groups disagree? + - What has been tried but failed, and why? + +5. EMERGING ANGLES + - What novel angles are researchers starting to explore? + - Any recent paradigm shifts in thinking about this topic? + +Provide specific, concrete information. Cite field knowledge accurately." + +After getting MiniMax's response, output it verbatim with the header: "=== MINIMAX ANALYSIS ===" +``` + +--- + +### Task C — Qwen: Pathway Landscape + Regulatory Network + +This task calls Qwen via `mcp__bioclaw__call_qwen` AND runs the pathway-search script for real database data. + +Prompt for Task C: +``` +You are mapping the pathway landscape for the bio-research-pipeline. 
+Research direction: {RESEARCH_DIRECTION} + +Step 1 — Run real pathway database search: +SKILL_DIR=$(find ~/.claude/skills -name "SKILL.md" -path "*/bio-research-pipeline/*" | xargs dirname 2>/dev/null | head -1) +python3 "$SKILL_DIR/scripts/pathway-search" "{RESEARCH_DIRECTION}" + +Step 2 — Call Qwen for pathway synthesis: +Use mcp__bioclaw__call_qwen with: + +system: "You are a systems biology expert specializing in signaling pathway analysis and gene regulatory networks." + +prompt: "For the research topic: {RESEARCH_DIRECTION} + +Please map the complete biological pathway landscape: + +1. CORE PATHWAYS INVOLVED + - List and briefly describe each relevant pathway + - Explain how they interconnect for this topic + +2. REGULATORY HIERARCHY + - Upstream triggers / sensors + - Master regulators (transcription factors, kinases) + - Key effectors and their downstream targets + - Feedback and feedforward loops + +3. CROSSTALK POINTS + - Where do pathways intersect or antagonize each other? + - Which nodes are shared across multiple pathways? + - Potential compensatory mechanisms to be aware of + +4. CONTEXT-SPECIFIC REGULATION + - How does this regulation differ between cell types? + - Tissue-specific or disease-specific pathway alterations + - Known species differences (mouse vs human) + +5. THERAPEUTIC INTERVENTION POINTS + - Which nodes are most druggable? 
+ - Existing drugs/inhibitors targeting these pathways + - Potential combination therapy rationale + +Be specific about molecule names and interaction types (phosphorylation, ubiquitination, transcriptional activation, etc.)" + +Output Qwen's response with header: "=== QWEN PATHWAY ANALYSIS ===" +Then append the real database results from Step 1 with header: "=== DATABASE RESULTS ===" +``` + +--- + +**After launching all 3 tasks**, collect results: +``` +taskA_result = TaskOutput(task_id_A) +taskB_result = TaskOutput(task_id_B) +taskC_result = TaskOutput(task_id_C) +``` +Wait for all 3 to complete before proceeding to Stage 2. + +**Send a progress update to the user** via `mcp__bioclaw__send_message`: +``` +"📚 文献检索完成(Claude + MiniMax + Qwen 三路并行) +🔬 Claude: PubMed {N} 篇 + 预印本 {M} 篇 +🤖 MiniMax: 生物医学知识分析完成 +⚡ Qwen: 通路图谱梳理完成 +正在综合分析,生成假说..." +``` + +--- + +## STAGE 2 — PATHWAY SYNTHESIS + HYPOTHESIS GENERATION + +**Goal:** Synthesize the 3 literature sources into a pathway map, then generate ≥5 mechanistic hypotheses. + +### 2a. Build Pathway Map + +From the 3 task outputs, extract: +- All mentioned proteins/genes → list with roles +- All mentioned pathways → list with descriptions +- Key interactions (A activates B, X inhibits Y) +- Unresolved questions explicitly mentioned in papers + +### 2b. Generate ≥5 Hypotheses + +For each hypothesis, output a structured block: + +``` +HYPOTHESIS [N]: [One-sentence title] + +Mechanism: + [2-3 sentences describing the molecular mechanism step by step] + e.g. "We propose that [A] activates [B] under [condition X], which leads to [downstream effect Y] + via [pathway Z]. This is supported by [evidence 1] but has not been directly tested in [context]." 
+ +Key molecular players: + - [Gene/Protein 1]: [role] + - [Gene/Protein 2]: [role] + - [Pathway]: [how it's involved] + +Supporting evidence: + - [Paper/finding that supports this] + - [Observation that is consistent with this] + +Evidence gaps (why this is a hypothesis, not established fact): + - [What has NOT been shown] + - [Conflicting data, if any] + +Novelty score (1-10): [score] +Reason: [why this is or isn't novel] + +Testability score (1-10): [score] +Reason: [how difficult it would be to test with standard wet lab methods] +``` + +Generate hypotheses that: +- Cover **different mechanistic angles** (not just variations of the same idea) +- Range from **conservative** (well-supported, incremental) to **bold** (less evidence, high impact) +- Are **wet-lab testable** (avoid purely computational hypotheses) + +--- + +## STAGE 3 — MULTI-AGENT DEBATE + +**Goal:** Critically evaluate each hypothesis from 3 perspectives to identify the strongest ones. + +For each hypothesis, conduct a structured 3-voice review. 
You will play each role in sequence: + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +DEBATE — HYPOTHESIS [N]: [title] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +🟢 SUPPORTER (argues FOR this hypothesis): + Strongest evidence points: + - [evidence 1] + - [evidence 2] + Why this mechanism is biologically plausible: + - [mechanistic reasoning] + Potential impact if confirmed: + - [scientific/clinical significance] + +🔴 SKEPTIC (argues AGAINST / identifies weaknesses): + Critical weaknesses: + - [flaw 1: e.g., "The key evidence comes from in vitro studies only"] + - [flaw 2: e.g., "Alternative explanation: this effect may be due to [X] instead"] + Confounding factors not accounted for: + - [confounder] + Prior work that challenges this: + - [conflicting evidence or null results] + +🔵 METHODOLOGIST (evaluates experimental feasibility): + To directly test this hypothesis, you would need: + - [key experiment] + Technical challenges: + - [challenge 1] + - [challenge 2] + Timeline estimate: [weeks/months] + Whether a typical university wet lab can do this: [Yes/No/Partially] + Model system recommendation: [cell line / mouse model / organoid / etc.] + +DEBATE VERDICT: + Evidence score (1-10): [score] — How well-supported is it currently? + Novelty score (1-10): [score] — How new is this idea? + Feasibility score (1-10): [score] — Can a wet lab test it in <12 months? + Impact score (1-10): [score] — How significant if confirmed? + + COMPOSITE SCORE: [average, weighted: Evidence×0.3 + Novelty×0.25 + Feasibility×0.25 + Impact×0.2] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +``` + +Run this for ALL hypotheses. Then rank by composite score and select **TOP 3**. 
+ +--- + +## STAGE 4 — TOP 3 REFINEMENT + +For each of the top 3 hypotheses, expand the mechanism with full molecular detail: + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +TOP [1/2/3]: [Hypothesis title] +Final composite score: [X.X/10] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +REFINED MECHANISM: + [4-6 sentences with full molecular detail] + Include: upstream triggers, key effectors, downstream consequences, feedback regulation + +PATHWAY DIAGRAM (text-based): + [Stimulus/Condition] + ↓ + [Receptor/Sensor] → activates → [Kinase/TF] + ↓ + [Key effector] + ↓ (promotes) ↓ (inhibits) + [Outcome A] [Outcome B] + +KEY UNKNOWNS to be resolved by experiments: + 1. [Unknown 1] + 2. [Unknown 2] + 3. [Unknown 3] +``` + +--- + +## STAGE 5 — WET LAB EXPERIMENTAL DESIGN + +For each of the top 3 hypotheses, design a complete wet-lab experimental plan: + +``` +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ +EXPERIMENTAL PLAN — [Hypothesis title] +━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + +RECOMMENDED MODEL SYSTEM: + Primary: [e.g., HEK293T cells / primary mouse hepatocytes / C57BL/6 mice] + Rationale: [why this model is appropriate] + Alternative: [backup model if primary unavailable] + +EXPERIMENT 1 — [Core test of the central claim] + Objective: [what this experiment proves or disproves] + + Method: + 1. [Step 1] + 2. [Step 2] + 3. [Step 3] + + Key reagents: + - [Antibody/siRNA/inhibitor/construct needed] + - [Source: commercial/need to generate] + + Readout: [what you measure — Western blot / qPCR / immunofluorescence / etc.] 
+ + Expected result if hypothesis is TRUE: + - [specific measurable outcome, e.g., "50%+ increase in phospho-X levels"] + + Expected result if hypothesis is FALSE: + - [what you'd see instead] + + Controls: + - Positive control: [what and why] + - Negative control: [what and why] + - Technical control: [e.g., loading control, vehicle control] + + Estimated time: [X weeks] + Difficulty: [Easy / Medium / Hard] + +EXPERIMENT 2 — [Validation / Orthogonal approach] + [same structure as Experiment 1] + +EXPERIMENT 3 — [In vivo / disease-relevance test, if applicable] + [same structure — note if this requires animal work / ethics approval] + +DECISION TREE: + If Experiment 1 result is positive → proceed to Experiment 2 + If Experiment 1 result is negative → [interpret: reject hypothesis OR check [alternative explanation]] + If Experiment 2 confirms → [next step: submit for funding / expand to in vivo] + If Experiment 2 conflicts with Experiment 1 → [troubleshoot: check [specific variable]] + +TIMELINE OVERVIEW: + Week 1-2: [setup, reagent procurement] + Week 3-6: [Experiment 1] + Week 7-10: [Experiment 2] + Week 11-16: [Experiment 3, if applicable] + Total estimated time to proof-of-concept: [X months] + +KEY RISKS: + - [Risk 1: e.g., "Primary antibody may not work in mouse samples"] + Mitigation: [e.g., "Order 2 alternative antibodies from different vendors"] + - [Risk 2: e.g., "Model system may not recapitulate in vivo physiology"] + Mitigation: [e.g., "Validate key finding in primary cells"] +``` + +--- + +## FINAL OUTPUT FORMAT + +After completing all 5 stages, produce a summary: + +``` +╔══════════════════════════════════════════════════════╗ +║ RESEARCH BRIEF — [RESEARCH DIRECTION] ║ +║ Generated: [date] ║ +╚══════════════════════════════════════════════════════╝ + +LITERATURE COVERAGE: + PubMed papers reviewed: ~[N] + Preprints reviewed: ~[N] + Key pathways identified: [list] + +ALL HYPOTHESES RANKED: + #1 [score] — [title] + #2 [score] — [title] + #3 [score] — 
[title] ← TOP 3 + #4 [score] — [title] + #5 [score] — [title] + [#6+ if generated] + +TOP 3 RECOMMENDED FOR INVESTIGATION: + → [Hypothesis 1 title] (strongest evidence + feasible) + → [Hypothesis 2 title] (most novel) + → [Hypothesis 3 title] (highest clinical impact) + +NEXT STEPS: + Immediate (0-1 month): [first experiment to run] + Short-term (1-6 months): [validation plan] + Long-term (6-18 months): [expansion strategy] + +FULL EXPERIMENTAL PLANS: see sections above +``` + +Save the complete output to `/workspace/group/research-brief-[slug].md` where [slug] is a short version of the research direction. + +Tell the user: "研究简报已完成,保存在 research-brief-[slug].md。以下是摘要:" then show the summary block. + +--- + +## IMPORTANT NOTES + +- **Do not skip the debate stage** — the debate is essential to filter weak hypotheses +- **Wet lab focus** — all experimental designs must be physically executable (pipettes, cells, animals), not just computational +- **Be specific** — vague statements like "further research needed" are not acceptable; every gap should map to a specific experiment +- **Cite as you go** — whenever you make a claim, reference which paper or database it came from +- **Chinese output is fine** — if the user wrote in Chinese, respond in Chinese throughout diff --git a/container/skills/bio-research-pipeline/scripts/pathway-search b/container/skills/bio-research-pipeline/scripts/pathway-search new file mode 100755 index 0000000..fa7abee --- /dev/null +++ b/container/skills/bio-research-pipeline/scripts/pathway-search @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +""" +Pathway database search (KEGG + Reactome + STRING) for bio-research-pipeline. 
+ +Usage: + pathway-search "gene or pathway name" [--gene SYMBOL] [--species hsa|mmu] + +Output: relevant pathways, interacting proteins, and known regulatory relationships +""" + +import sys +import argparse +import json +import time + +try: + import requests +except ImportError: + print("ERROR: requests not installed.", file=sys.stderr) + sys.exit(1) + + +def search_kegg_pathways(query: str, species: str = "hsa") -> list[dict]: + """Search KEGG for pathways matching a keyword.""" + results = [] + + # KEGG find: search pathway database + url = f"https://rest.kegg.jp/find/pathway/{requests.utils.quote(query)}" + try: + r = requests.get(url, timeout=15) + if r.status_code == 200 and r.text.strip(): + for line in r.text.strip().split("\n"): + parts = line.split("\t", 1) + if len(parts) == 2: + path_id, path_name = parts + results.append({"id": path_id, "name": path_name, "source": "KEGG"}) + except Exception as e: + print(f"[WARNING] KEGG pathway search failed: {e}", file=sys.stderr) + + return results[:10] + + +def get_kegg_pathway_genes(pathway_id: str) -> list[str]: + """Get genes in a KEGG pathway.""" + # Convert to species-specific ID if needed + if not pathway_id.startswith("path:"): + pathway_id = f"path:{pathway_id}" + + url = f"https://rest.kegg.jp/get/{pathway_id}" + genes = [] + try: + r = requests.get(url, timeout=15) + if r.status_code == 200: + in_gene_section = False + for line in r.text.split("\n"): + if line.startswith("GENE"): + in_gene_section = True + elif line.startswith("COMPOUND") or line.startswith("REACTION") or line.startswith("///"): + in_gene_section = False + if in_gene_section and line.strip(): + # Extract gene symbol from format: " 1234 GENE_SYMBOL; description" + parts = line.strip().split(";")[0].split() + if len(parts) >= 2: + genes.append(parts[1]) + except Exception as e: + print(f"[WARNING] KEGG gene fetch failed: {e}", file=sys.stderr) + + return genes[:30] + + +def search_reactome(query: str) -> list[dict]: + """Search Reactome 
for pathways.""" + url = "https://reactome.org/ContentService/search/query" + params = { + "query": query, + "species": "Homo sapiens", + "types": "Pathway", + "cluster": "true", + } + results = [] + + try: + r = requests.get(url, params=params, timeout=20) + if r.status_code == 200: + data = r.json() + entries = data.get("results", [{}])[0].get("entries", []) if data.get("results") else [] + for entry in entries[:8]: + results.append({ + "id": entry.get("stId", ""), + "name": entry.get("name", ""), + "type": entry.get("type", ""), + "species": entry.get("species", ""), + "source": "Reactome", + "url": f"https://reactome.org/PathwayBrowser/#/{entry.get('stId', '')}", + }) + except Exception as e: + print(f"[WARNING] Reactome search failed: {e}", file=sys.stderr) + + return results + + +def search_string_interactions(gene_symbol: str, species: int = 9606, limit: int = 20) -> list[dict]: + """Get protein-protein interactions from STRING.""" + url = "https://string-db.org/api/json/network" + params = { + "identifiers": gene_symbol, + "species": species, + "limit": limit, + "required_score": 700, # high confidence + "caller_identity": "bioclaw-research-pipeline", + } + interactions = [] + + try: + r = requests.get(url, params=params, timeout=20) + if r.status_code == 200: + data = r.json() + for edge in data[:limit]: + interactions.append({ + "protein_a": edge.get("preferredName_A", ""), + "protein_b": edge.get("preferredName_B", ""), + "score": edge.get("score", 0), + "source": "STRING", + }) + except Exception as e: + print(f"[WARNING] STRING search failed: {e}", file=sys.stderr) + + return interactions + + +def search_string_functional(gene_symbol: str, species: int = 9606) -> list[dict]: + """Get functional enrichment for a gene from STRING.""" + url = "https://string-db.org/api/json/functional_annotation" + params = { + "identifiers": gene_symbol, + "species": species, + "caller_identity": "bioclaw-research-pipeline", + } + annotations = [] + + try: + r = 
def main():
    """CLI entry point: query KEGG, Reactome and STRING and print the combined results.

    Progress and warnings go to stderr; the report (or JSON with ``--json``)
    goes to stdout. STRING functional annotations are fetched only when
    ``--gene`` is given.
    """
    parser = argparse.ArgumentParser(description="Search pathway databases (KEGG, Reactome, STRING)")
    parser.add_argument("query", help="Gene name, protein name, or pathway keyword")
    parser.add_argument("--gene", help="Specific gene symbol for STRING interaction search")
    parser.add_argument("--species", default="hsa", help="Species code (hsa=human, mmu=mouse)")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    # A blank query would otherwise crash below on args.query.split()[0].
    if not args.query.strip():
        parser.error("query must not be empty")

    taxonomy_ids = {"hsa": 9606, "human": 9606, "mmu": 10090, "mouse": 10090}
    species_ncbi = taxonomy_ids.get(args.species)
    if species_ncbi is None:
        # Historical behaviour: anything that is not human is treated as mouse.
        print(
            f"[WARNING] Unknown species '{args.species}'; using mouse (10090) for STRING",
            file=sys.stderr,
        )
        species_ncbi = 10090

    print(f"[Pathway] Searching for: {args.query}", file=sys.stderr)

    # Short pauses between services keep the request rate polite.
    kegg_pathways = search_kegg_pathways(args.query, species=args.species)
    time.sleep(0.3)

    reactome_pathways = search_reactome(args.query)
    time.sleep(0.3)

    gene_symbol = args.gene or args.query.split()[0]
    string_interactions = search_string_interactions(gene_symbol, species=species_ncbi)
    time.sleep(0.3)

    string_functions = search_string_functional(gene_symbol, species=species_ncbi) if args.gene else []

    # Genes are listed only for the top KEGG hit.
    kegg_genes = []
    if kegg_pathways:
        kegg_genes = get_kegg_pathway_genes(kegg_pathways[0]["id"])

    result = {
        "query": args.query,
        "kegg_pathways": kegg_pathways,
        "kegg_genes_top_pathway": kegg_genes,
        "reactome_pathways": reactome_pathways,
        "string_interactions": string_interactions,
        "string_functional": string_functions,
    }

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return

    rule = "=" * 60

    print(f"\n{rule}")
    print(f"PATHWAY DATABASE RESULTS: {args.query}")
    print(f"{rule}\n")

    # KEGG
    if kegg_pathways:
        print(f"KEGG PATHWAYS ({len(kegg_pathways)} found):")
        for p in kegg_pathways:
            print(f" {p['id']:20s} {p['name']}")
        if kegg_genes:
            print(f"\n Genes in top pathway ({kegg_pathways[0]['name']}):")
            print(f" {', '.join(kegg_genes)}")
    else:
        print("KEGG: No pathways found for this query.")

    print()

    # Reactome
    if reactome_pathways:
        print(f"REACTOME PATHWAYS ({len(reactome_pathways)} found):")
        for p in reactome_pathways:
            print(f" [{p['id']}] {p['name']}")
            print(f" URL: {p['url']}")
    else:
        print("Reactome: No pathways found.")

    print()

    # STRING
    if string_interactions:
        print(f"STRING INTERACTIONS for '{gene_symbol}' (high-confidence, score≥0.7):")
        for edge in string_interactions[:15]:
            score_pct = int(edge["score"] * 100)
            print(f" {edge['protein_a']:12s} ↔ {edge['protein_b']:12s} (confidence: {score_pct}%)")
    else:
        print(f"STRING: No high-confidence interactions found for '{gene_symbol}'.")

    if string_functions:
        print(f"\n Functional annotations:")
        for fn in string_functions[:8]:
            print(f" [{fn['category']}] {fn['description']} (FDR={fn['fdr']:.2e})")

    print(f"\n{rule}")
    print("END OF PATHWAY RESULTS")
    print(f"{rule}")

    print("\nNOTE FOR AGENT: Use these pathways as context when formulating hypotheses.")
    print("Cross-reference pathway membership with literature findings to identify key nodes.")
def fetch_preprints(
    topic: str,
    days_back: int = 180,
    max_results: int = 30,
    server: str = "both",
) -> list[dict]:
    """Search Europe PMC for preprints on ``topic`` published in the last ``days_back`` days.

    ``server`` narrows the search to bioRxiv or medRxiv; any other value
    searches all preprint sources. Results are requested most-cited first.
    On a timeout or any other error the bioRxiv fallback is used instead.
    At most ``max_results`` records are returned.
    """
    date_format = "%Y-%m-%d"
    end_date = datetime.now().strftime(date_format)
    start_date = (datetime.now() - timedelta(days=days_back)).strftime(date_format)

    print(f"[Preprint] Date range: {start_date} to {end_date}", file=sys.stderr)
    print(f"[Preprint] Searching for: {topic}", file=sys.stderr)

    # SRC:PPR covers every preprint source indexed by Europe PMC.
    publisher_filters = {
        "biorxiv": '(PUBLISHER:"bioRxiv")',
        "medrxiv": '(PUBLISHER:"medRxiv")',
    }
    source_filter = publisher_filters.get(server, "SRC:PPR")

    query = f'({topic}) AND {source_filter} AND (FIRST_PDATE:[{start_date} TO {end_date}])'

    print(f"[Preprint] Query: {query}", file=sys.stderr)

    request_params = {
        "query": query,
        "format": "json",
        "pageSize": min(max_results, 100),
        "sort": "CITED desc",  # most cited first
        "resultType": "core",
    }

    try:
        response = requests.get(EUROPEPMC_API, params=request_params, timeout=30)
        response.raise_for_status()
        payload = response.json()

        hits = payload.get("resultList", {}).get("result", [])
        print(f"[Preprint] API returned {len(hits)} results (total: {payload.get('hitCount', '?')})", file=sys.stderr)

        collected = []
        for hit in hits:
            if hit.get("journalInfo"):
                category = hit.get("journalInfo", {}).get("journal", {}).get("title", "")
            else:
                category = ""
            collected.append({
                "server": hit.get("bookOrReportDetails", {}).get("publisher", "Preprint"),
                "title": hit.get("title", ""),
                "authors": hit.get("authorString", ""),
                "date": hit.get("firstPublicationDate", ""),
                "doi": hit.get("doi", ""),
                "abstract": hit.get("abstractText", ""),
                "category": category,
                "pmcid": hit.get("pmcid", ""),
                "source": hit.get("source", ""),
                "cited_by": hit.get("citedByCount", 0),
            })

    except requests.exceptions.Timeout:
        print(f"[WARNING] Europe PMC API timed out, trying fallback...", file=sys.stderr)
        return _fallback_biorxiv(topic, start_date, end_date, max_results, server)
    except Exception as e:
        print(f"[WARNING] Europe PMC API error: {e}", file=sys.stderr)
        return _fallback_biorxiv(topic, start_date, end_date, max_results, server)

    return collected[:max_results]
+ source_filter = "SRC:PPR" + if server == "biorxiv": + source_filter = '(PUBLISHER:"bioRxiv")' + elif server == "medrxiv": + source_filter = '(PUBLISHER:"medRxiv")' + + query = f'({topic}) AND {source_filter} AND (FIRST_PDATE:[{start_date} TO {end_date}])' + + print(f"[Preprint] Query: {query}", file=sys.stderr) + + all_papers = [] + page_size = min(max_results, 100) + + try: + r = requests.get( + EUROPEPMC_API, + params={ + "query": query, + "format": "json", + "pageSize": page_size, + "sort": "CITED desc", # most cited first + "resultType": "core", + }, + timeout=30, + ) + r.raise_for_status() + data = r.json() + + results = data.get("resultList", {}).get("result", []) + print(f"[Preprint] API returned {len(results)} results (total: {data.get('hitCount', '?')})", file=sys.stderr) + + for paper in results: + authors_list = paper.get("authorString", "") + all_papers.append({ + "server": paper.get("bookOrReportDetails", {}).get("publisher", "Preprint"), + "title": paper.get("title", ""), + "authors": authors_list, + "date": paper.get("firstPublicationDate", ""), + "doi": paper.get("doi", ""), + "abstract": paper.get("abstractText", ""), + "category": paper.get("journalInfo", {}).get("journal", {}).get("title", "") if paper.get("journalInfo") else "", + "pmcid": paper.get("pmcid", ""), + "source": paper.get("source", ""), + "cited_by": paper.get("citedByCount", 0), + }) + + except requests.exceptions.Timeout: + print(f"[WARNING] Europe PMC API timed out, trying fallback...", file=sys.stderr) + return _fallback_biorxiv(topic, start_date, end_date, max_results, server) + except Exception as e: + print(f"[WARNING] Europe PMC API error: {e}", file=sys.stderr) + return _fallback_biorxiv(topic, start_date, end_date, max_results, server) + + return all_papers[:max_results] + + +def _fallback_biorxiv(topic: str, start_date: str, end_date: str, max_results: int, server: str) -> list[dict]: + """Fallback to bioRxiv API with shorter date range to avoid timeout.""" + # Shorten 
def main():
    """CLI entry point: fetch preprints for a topic and print them as text or JSON.

    Progress messages go to stderr. With ``--json`` the raw record list is
    dumped; otherwise a numbered, truncated summary is printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Fetch and summarize bioRxiv/medRxiv preprints")
    parser.add_argument("topic", help="Research topic")
    parser.add_argument("--max", type=int, default=30, help="Max preprints to return (default: 30)")
    parser.add_argument("--days", type=int, default=180, help="Days back to search (default: 180)")
    parser.add_argument("--server", choices=["biorxiv", "medrxiv", "both"], default="both")
    parser.add_argument("--json", action="store_true", help="Output raw JSON")
    args = parser.parse_args()

    def clip(text, limit):
        # Shorten long fields so each entry stays compact.
        return text[:limit] + "..." if len(text) > limit else text

    print(f"[Preprint] Searching for: {args.topic}", file=sys.stderr)

    papers = fetch_preprints(
        topic=args.topic,
        days_back=args.days,
        max_results=args.max,
        server=args.server,
    )

    print(f"[Preprint] Found {len(papers)} relevant preprints", file=sys.stderr)

    if args.json:
        print(json.dumps(papers, ensure_ascii=False, indent=2))
        return

    rule = "=" * 60

    print(f"\n{rule}")
    print(f"PREPRINT RESULTS: {args.topic}")
    print(f"Papers found: {len(papers)} | Servers: {args.server} | Period: last {args.days} days")
    print(f"{rule}\n")

    if not papers:
        print("No relevant preprints found for this topic in the specified date range.")
        print("Suggestions:")
        print(" - Try broader search terms")
        print(" - Increase --days parameter")
        print(f" - Search directly at https://www.biorxiv.org/search/{quote(args.topic)}")
        return

    for index, entry in enumerate(papers, 1):
        author_line = clip(entry["authors"], 80)
        server_tag = entry.get("server", "Preprint")

        citations = entry.get("cited_by")
        citation_note = f" | Cited: {citations}" if citations else ""

        if entry.get("doi"):
            doi_line = f" DOI: https://doi.org/{entry['doi']}"
        else:
            doi_line = " DOI: N/A"

        print(f"[{index}] [{server_tag}] {entry['title']}")
        print(f" {author_line}")
        print(f" Date: {entry['date']}{citation_note}")
        print(doi_line)

        summary = clip(entry.get("abstract", ""), 450)
        if summary:
            print(f" {summary}")
        print()

    print(f"\n{rule}")
    print("END OF PREPRINT RESULTS")
    print(f"{rule}")

    print("\nNOTE FOR AGENT: Preprints have NOT been peer-reviewed.")
    print("Treat findings as preliminary. Check if any have since been published in journals.")
    print(f"Direct search URL: https://www.biorxiv.org/search/{quote(args.topic)}")
Check if any have since been published in journals.") + print(f"Direct search URL: https://www.biorxiv.org/search/{quote(args.topic)}") + + +if __name__ == "__main__": + main() diff --git a/container/skills/bio-research-pipeline/scripts/pubmed-fetch b/container/skills/bio-research-pipeline/scripts/pubmed-fetch new file mode 100755 index 0000000..4441d4b --- /dev/null +++ b/container/skills/bio-research-pipeline/scripts/pubmed-fetch @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +""" +PubMed literature fetcher for bio-research-pipeline. + +Usage: + pubmed-fetch "research topic" [--max 40] [--years 5] [--mode abstract|full] + +Output: structured text summary of relevant papers +""" + +import sys +import argparse +import json +import re +from datetime import datetime + +try: + from Bio import Entrez, Medline +except ImportError: + print("ERROR: biopython not installed. Run: pip3 install biopython", file=sys.stderr) + sys.exit(1) + +Entrez.email = "bioclaw-agent@research.ai" +Entrez.tool = "BioClaw-ResearchPipeline" + + +def search_pubmed(query: str, max_results: int = 40, years_back: int = 5) -> list[str]: + current_year = datetime.now().year + min_year = current_year - years_back + + handle = Entrez.esearch( + db="pubmed", + term=query, + retmax=max_results, + sort="relevance", + datetype="pdat", + mindate=str(min_year), + maxdate=str(current_year), + ) + record = Entrez.read(handle) + handle.close() + return record["IdList"] + + +def fetch_abstracts(pmids: list[str]) -> list[dict]: + if not pmids: + return [] + + handle = Entrez.efetch( + db="pubmed", + id=",".join(pmids), + rettype="medline", + retmode="text", + ) + records = list(Medline.parse(handle)) + handle.close() + + papers = [] + for rec in records: + papers.append({ + "pmid": rec.get("PMID", ""), + "title": rec.get("TI", "No title"), + "authors": rec.get("AU", [])[:3], # first 3 authors + "journal": rec.get("TA", ""), + "year": rec.get("DP", "")[:4], + "abstract": rec.get("AB", "No abstract available"), + 
def extract_key_entities(papers: list[dict]) -> dict:
    """Heuristically list well-known pathway/gene keywords mentioned in abstracts.

    Matching is case-insensitive and token-aware rather than plain substring:
    a keyword must not be preceded by a letter or digit, so "RB" does not
    match inside "herbal". Short symbols (up to five characters) may only be
    followed by a plural "s" or a numeric suffix such as "STAT3" or "mTORC1",
    so "STAT" does not match "statistical". Longer process names match as
    prefixes ("Ubiquitin" matches "ubiquitination").

    Returns:
        ``{"mentioned_pathways": [...]}`` with at most 20 keywords, in the
        order of the keyword table, and case variants ("Wnt"/"WNT") reported
        once using the first spelling.
    """
    import re

    pathway_keywords = [
        "MAPK", "PI3K", "AKT", "mTOR", "NF-κB", "NFKB", "Wnt", "WNT",
        "Notch", "NOTCH", "Hedgehog", "JAK", "STAT", "TGF-β", "TGFB",
        "p53", "TP53", "AMPK", "HIF", "VEGF", "TNF", "IL-6", "IL6",
        "Hippo", "YAP", "TAZ", "KRAS", "EGFR", "ERK", "JNK", "p38",
        "CDK", "RB", "E2F", "Autophagy", "AUTOPHAGY", "Apoptosis", "APOPTOSIS",
        "Ferroptosis", "FERROPTOSIS", "Pyroptosis", "Ubiquitin", "UBIQUITIN",
    ]

    all_text = " ".join((p.get("abstract") or "") for p in papers)

    pathways = []
    seen = set()
    for kw in pathway_keywords:
        key = kw.casefold()
        if key in seen:
            continue  # case variant of a keyword already checked
        seen.add(key)

        pattern = r"(?<![A-Za-z0-9])" + re.escape(kw)
        if len(kw) <= 5:
            # Short symbols: allow plural / isoform suffixes but not longer words.
            pattern += r"(?:s|C?\d+[A-Za-z]?)?(?![A-Za-z])"
        if re.search(pattern, all_text, flags=re.IGNORECASE):
            pathways.append(kw)

    return {"mentioned_pathways": pathways[:20]}
def main():
    """CLI entry point: search PubMed for a topic and print abstracts as text or JSON.

    A mechanism-focused query is tried first; if it finds nothing, the plain
    topic is searched. Every PMID returned by the search (already limited by
    ``--max``) is fetched. Progress messages go to stderr.
    """
    parser = argparse.ArgumentParser(description="Fetch and summarize PubMed literature")
    parser.add_argument("topic", help="Research topic or query string")
    parser.add_argument("--max", type=int, default=40, help="Max papers to retrieve (default: 40)")
    parser.add_argument("--years", type=int, default=5, help="Years back to search (default: 5)")
    parser.add_argument("--json", action="store_true", help="Output raw JSON instead of formatted text")
    args = parser.parse_args()

    print(f"[PubMed] Searching for: {args.topic}", file=sys.stderr)
    print(f"[PubMed] Parameters: max={args.max}, years_back={args.years}", file=sys.stderr)

    # Build enriched query
    base_query = args.topic
    mechanism_query = f"({base_query}[Title/Abstract]) AND (mechanism[Title/Abstract] OR pathway[Title/Abstract] OR signaling[Title/Abstract] OR molecular[Title/Abstract])"

    pmids = search_pubmed(mechanism_query, max_results=args.max, years_back=args.years)
    print(f"[PubMed] Found {len(pmids)} papers", file=sys.stderr)

    if not pmids:
        # Fallback: broader search without mechanism filter
        pmids = search_pubmed(base_query, max_results=args.max, years_back=args.years)
        print(f"[PubMed] Fallback search: {len(pmids)} papers", file=sys.stderr)

    # The search already honours --max, so fetch everything it returned
    # instead of silently capping at 30.
    papers = fetch_abstracts(pmids)
    entities = extract_key_entities(papers)

    if args.json:
        print(json.dumps({"papers": papers, "entities": entities}, ensure_ascii=False, indent=2))
        return

    rule = "=" * 60

    # Formatted output
    print(f"\n{rule}")
    print(f"PUBMED SEARCH RESULTS: {args.topic}")
    print(f"Papers retrieved: {len(papers)} | Search period: last {args.years} years")
    print(f"{rule}\n")

    if entities["mentioned_pathways"]:
        print(f"KEY PATHWAYS MENTIONED ACROSS PAPERS:")
        print(f" {', '.join(entities['mentioned_pathways'])}\n")

    for i, paper in enumerate(papers, 1):
        authors_str = ", ".join(paper["authors"]) if paper["authors"] else "Unknown"
        if len(paper["authors"]) >= 3:
            authors_str += " et al."

        print(f"[{i}] {paper['title']}")
        print(f" {authors_str} | {paper['journal']} {paper['year']} | PMID: {paper['pmid']}")

        # Truncate abstract to 400 chars
        abstract = paper["abstract"]
        if len(abstract) > 400:
            abstract = abstract[:400] + "..."
        print(f" {abstract}")

        if paper["mesh_terms"]:
            print(f" MeSH: {', '.join(paper['mesh_terms'][:5])}")
        print()

    print(f"\n{rule}")
    print("END OF PUBMED RESULTS")
    print(f"{rule}")
newline at end of file diff --git a/container/skills/bio-rna-quantification-salmon/scripts/process b/container/skills/bio-rna-quantification-salmon/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-rna-quantification-salmon/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-rna-quantification-salmon/scripts/run b/container/skills/bio-rna-quantification-salmon/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-rna-quantification-salmon/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-rna-quantification-salmon/scripts/search b/container/skills/bio-rna-quantification-salmon/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-rna-quantification-salmon/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-rna-quantification-salmon/skill.md b/container/skills/bio-rna-quantification-salmon/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-rna-quantification-salmon/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-scatac-seq-analysis/scripts/analyze b/container/skills/bio-scatac-seq-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-scatac-seq-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-scatac-seq-analysis/scripts/process b/container/skills/bio-scatac-seq-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-scatac-seq-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-scatac-seq-analysis/scripts/run 
b/container/skills/bio-scatac-seq-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-scatac-seq-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-scatac-seq-analysis/scripts/search b/container/skills/bio-scatac-seq-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-scatac-seq-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-scatac-seq-analysis/skill.md b/container/skills/bio-scatac-seq-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-scatac-seq-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-sequence-alignment/scripts/analyze b/container/skills/bio-sequence-alignment/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-sequence-alignment/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-sequence-alignment/scripts/process b/container/skills/bio-sequence-alignment/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-sequence-alignment/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-sequence-alignment/scripts/run b/container/skills/bio-sequence-alignment/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-sequence-alignment/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-sequence-alignment/scripts/search b/container/skills/bio-sequence-alignment/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-sequence-alignment/scripts/search @@ -0,0 +1 @@ +404: Not Found 
\ No newline at end of file diff --git a/container/skills/bio-sequence-alignment/skill.md b/container/skills/bio-sequence-alignment/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-sequence-alignment/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-annotation/scripts/analyze b/container/skills/bio-single-cell-annotation/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-annotation/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-annotation/scripts/process b/container/skills/bio-single-cell-annotation/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-annotation/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-annotation/scripts/run b/container/skills/bio-single-cell-annotation/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-annotation/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-annotation/scripts/search b/container/skills/bio-single-cell-annotation/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-annotation/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-annotation/skill.md b/container/skills/bio-single-cell-annotation/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-annotation/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-clustering/scripts/analyze b/container/skills/bio-single-cell-clustering/scripts/analyze new 
file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-clustering/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-clustering/scripts/process b/container/skills/bio-single-cell-clustering/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-clustering/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-clustering/scripts/run b/container/skills/bio-single-cell-clustering/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-clustering/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-clustering/scripts/search b/container/skills/bio-single-cell-clustering/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-clustering/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-clustering/skill.md b/container/skills/bio-single-cell-clustering/skill.md new file mode 100644 index 0000000..db1bea4 --- /dev/null +++ b/container/skills/bio-single-cell-clustering/skill.md @@ -0,0 +1,330 @@ +--- +name: bio-single-cell-clustering +description: Dimensionality reduction and clustering for single-cell RNA-seq using Seurat (R) and Scanpy (Python). Use for running PCA, computing neighbors, clustering with Leiden/Louvain algorithms, generating UMAP/tSNE embeddings, and visualizing clusters. Use when performing dimensionality reduction and clustering on single-cell data. +tool_type: mixed +primary_tool: Seurat +--- + +## Version Compatibility + +Reference examples tested with: ggplot2 3.5+, matplotlib 3.8+, scanpy 1.10+ + +Before using code patterns, verify installed versions match. 
If versions differ: +- Python: `pip show <package>` then `help(module.function)` to check signatures +- R: `packageVersion('<pkg>')` then `?function_name` to verify parameters + +If code throws ImportError, AttributeError, or TypeError, introspect the installed +package and adapt the example to match the actual API rather than retrying. + +# Single-Cell Clustering + +Dimensionality reduction, neighbor graph construction, and clustering. + +## Scanpy (Python) + +**Goal:** Reduce dimensions, build neighbor graphs, cluster cells, and visualize with UMAP/tSNE using Scanpy. + +**Approach:** Run PCA for dimensionality reduction, construct a k-NN graph, apply Leiden community detection, and compute UMAP embedding. + +**"Cluster cells and find groups"** → Reduce dimensionality with PCA, build a neighborhood graph, partition cells into clusters, and embed in 2D for visualization. + +### Required Imports + +```python +import scanpy as sc +import matplotlib.pyplot as plt +``` + +### PCA + +```python +# Run PCA +sc.tl.pca(adata, n_comps=50, svd_solver='arpack') + +# Visualize variance explained +sc.pl.pca_variance_ratio(adata, n_pcs=50) + +# Visualize PCA +sc.pl.pca(adata, color='n_genes_by_counts') +``` + +### Determine Number of PCs + +```python +# Elbow plot to choose number of PCs +sc.pl.pca_variance_ratio(adata, n_pcs=50, log=True) + +# Typically use 10-50 PCs based on elbow +n_pcs = 30 +``` + +### Compute Neighbors + +```python +# Build k-nearest neighbor graph +sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30) +``` + +### Clustering (Leiden - Recommended) + +```python +# Leiden clustering (preferred over Louvain) +sc.tl.leiden(adata, resolution=0.5) + +# Higher resolution = more clusters +sc.tl.leiden(adata, resolution=1.0, key_added='leiden_r1') + +# View cluster sizes +adata.obs['leiden'].value_counts() +``` + +### Clustering (Louvain) + +```python +# Louvain clustering (alternative) +sc.tl.louvain(adata, resolution=0.5) +``` + +### UMAP + +```python +# Compute UMAP embedding
+sc.tl.umap(adata, min_dist=0.3, spread=1.0) + +# Visualize clusters on UMAP +sc.pl.umap(adata, color='leiden') + +# Color by gene expression +sc.pl.umap(adata, color=['leiden', 'CD3D', 'MS4A1', 'CD14']) +``` + +### tSNE + +```python +# Compute tSNE (slower than UMAP) +sc.tl.tsne(adata, n_pcs=30, perplexity=30) + +# Visualize +sc.pl.tsne(adata, color='leiden') +``` + +### Complete Clustering Pipeline + +**Goal:** Run end-to-end clustering from preprocessed data to UMAP visualization. + +**Approach:** Chain PCA, neighbor computation, Leiden clustering, and UMAP into a single pipeline. + +```python +import scanpy as sc + +# Assumes preprocessed data +adata = sc.read_h5ad('preprocessed.h5ad') + +# PCA +sc.tl.pca(adata, n_comps=50) + +# Neighbors +sc.pp.neighbors(adata, n_neighbors=15, n_pcs=30) + +# Cluster +sc.tl.leiden(adata, resolution=0.5) + +# UMAP +sc.tl.umap(adata) + +# Visualize +sc.pl.umap(adata, color='leiden') +``` + +### Exploring Different Resolutions + +**Goal:** Evaluate clustering at multiple resolutions to find the appropriate granularity. + +**Approach:** Iterate over resolution values, cluster at each, and compare cluster counts on UMAP. + +```python +# Try multiple resolutions +for res in [0.2, 0.5, 0.8, 1.0, 1.5]: + sc.tl.leiden(adata, resolution=res, key_added=f'leiden_r{res}') + n_clusters = adata.obs[f'leiden_r{res}'].nunique() + print(f'Resolution {res}: {n_clusters} clusters') + +# Compare on UMAP +sc.pl.umap(adata, color=['leiden_r0.2', 'leiden_r0.5', 'leiden_r1.0'], ncols=3) +``` + +### PAGA (Trajectory Inference) + +```python +# Partition-based graph abstraction +sc.tl.paga(adata, groups='leiden') +sc.pl.paga(adata, color='leiden') + +# Use PAGA for UMAP initialization +sc.tl.umap(adata, init_pos='paga') +``` + +--- + +## Seurat (R) + +**Goal:** Reduce dimensions, build neighbor graphs, cluster cells, and visualize with UMAP/tSNE using Seurat. 
+ +**Approach:** Run PCA, determine optimal PC count, construct SNN graph, apply Louvain clustering, and compute UMAP embedding. + +### Required Libraries + +```r +library(Seurat) +library(ggplot2) +``` + +### PCA + +```r +# Run PCA +seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(seurat_obj), npcs = 50) + +# Visualize PCA +DimPlot(seurat_obj, reduction = 'pca') +VizDimLoadings(seurat_obj, dims = 1:2, reduction = 'pca') + +# Heatmaps of PC genes +DimHeatmap(seurat_obj, dims = 1:6, cells = 500, balanced = TRUE) +``` + +### Determine Number of PCs + +```r +# Elbow plot +ElbowPlot(seurat_obj, ndims = 50) + +# JackStraw (more rigorous but slow) +seurat_obj <- JackStraw(seurat_obj, num.replicate = 100) +seurat_obj <- ScoreJackStraw(seurat_obj, dims = 1:20) +JackStrawPlot(seurat_obj, dims = 1:20) +``` + +### Find Neighbors + +```r +# Build KNN graph +seurat_obj <- FindNeighbors(seurat_obj, dims = 1:30) +``` + +### Find Clusters + +```r +# Louvain clustering (default) +seurat_obj <- FindClusters(seurat_obj, resolution = 0.5) + +# View cluster assignments +head(Idents(seurat_obj)) +table(Idents(seurat_obj)) +``` + +### Exploring Different Resolutions + +```r +# Try multiple resolutions +seurat_obj <- FindClusters(seurat_obj, resolution = c(0.2, 0.5, 0.8, 1.0, 1.5)) + +# Results stored in metadata +head(seurat_obj@meta.data) + +# Compare resolutions +library(clustree) +clustree(seurat_obj, prefix = 'RNA_snn_res.') +``` + +### UMAP + +```r +# Run UMAP +seurat_obj <- RunUMAP(seurat_obj, dims = 1:30) + +# Visualize +DimPlot(seurat_obj, reduction = 'umap', label = TRUE) + +# Split by sample +DimPlot(seurat_obj, reduction = 'umap', split.by = 'sample') +``` + +### tSNE + +```r +# Run tSNE +seurat_obj <- RunTSNE(seurat_obj, dims = 1:30) + +# Visualize +DimPlot(seurat_obj, reduction = 'tsne') +``` + +### Complete Clustering Pipeline + +**Goal:** Run end-to-end Seurat clustering from preprocessed data to UMAP visualization. 
+ +**Approach:** Chain PCA, neighbor finding, cluster detection, and UMAP into a single pipeline. + +```r +library(Seurat) + +# Assumes preprocessed data +seurat_obj <- readRDS('preprocessed.rds') + +# PCA +seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = FALSE) + +# Neighbors +seurat_obj <- FindNeighbors(seurat_obj, dims = 1:30) + +# Cluster +seurat_obj <- FindClusters(seurat_obj, resolution = 0.5) + +# UMAP +seurat_obj <- RunUMAP(seurat_obj, dims = 1:30) + +# Visualize +DimPlot(seurat_obj, reduction = 'umap', label = TRUE) +``` + +### Access Embeddings + +```r +# Get PCA coordinates +pca_coords <- Embeddings(seurat_obj, reduction = 'pca') + +# Get UMAP coordinates +umap_coords <- Embeddings(seurat_obj, reduction = 'umap') + +# Add to metadata for custom plotting +seurat_obj$UMAP_1 <- umap_coords[, 1] +seurat_obj$UMAP_2 <- umap_coords[, 2] +``` + +--- + +## Parameter Reference + +| Parameter | Typical Values | Effect | +|-----------|---------------|--------| +| n_pcs | 10-50 | More PCs capture more variance | +| n_neighbors | 10-30 | Higher = smoother, lower = more local | +| resolution | 0.2-2.0 | Higher = more clusters | +| min_dist (UMAP) | 0.1-0.5 | Lower = tighter clusters | + +## Method Comparison + +| Step | Scanpy | Seurat | +|------|--------|--------| +| PCA | `sc.tl.pca()` | `RunPCA()` | +| Neighbors | `sc.pp.neighbors()` | `FindNeighbors()` | +| Cluster | `sc.tl.leiden()` | `FindClusters()` | +| UMAP | `sc.tl.umap()` | `RunUMAP()` | +| tSNE | `sc.tl.tsne()` | `RunTSNE()` | + +## Related Skills + +- preprocessing - Data must be preprocessed before clustering +- markers-annotation - Find markers for each cluster +- data-io - Save clustered results diff --git a/container/skills/bio-single-cell-integration/scripts/analyze b/container/skills/bio-single-cell-integration/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-integration/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at 
end of file diff --git a/container/skills/bio-single-cell-integration/scripts/process b/container/skills/bio-single-cell-integration/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-integration/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-integration/scripts/run b/container/skills/bio-single-cell-integration/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-integration/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-integration/scripts/search b/container/skills/bio-single-cell-integration/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-integration/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-integration/skill.md b/container/skills/bio-single-cell-integration/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-integration/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-preprocessing/scripts/analyze b/container/skills/bio-single-cell-preprocessing/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-preprocessing/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-preprocessing/scripts/process b/container/skills/bio-single-cell-preprocessing/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-preprocessing/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-preprocessing/scripts/run 
b/container/skills/bio-single-cell-preprocessing/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-preprocessing/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-preprocessing/scripts/search b/container/skills/bio-single-cell-preprocessing/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-preprocessing/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-preprocessing/skill.md b/container/skills/bio-single-cell-preprocessing/skill.md new file mode 100644 index 0000000..e9e6845 --- /dev/null +++ b/container/skills/bio-single-cell-preprocessing/skill.md @@ -0,0 +1,338 @@ +--- +name: bio-single-cell-preprocessing +description: Quality control, filtering, and normalization for single-cell RNA-seq using Seurat (R) and Scanpy (Python). Use for calculating QC metrics, filtering cells and genes, normalizing counts, identifying highly variable genes, and scaling data. Use when filtering, normalizing, and selecting features in single-cell data. +tool_type: mixed +primary_tool: Seurat +--- + +## Version Compatibility + +Reference examples tested with: ggplot2 3.5+, matplotlib 3.8+, numpy 1.26+, scanpy 1.10+ + +Before using code patterns, verify installed versions match. If versions differ: +- Python: `pip show <package>` then `help(module.function)` to check signatures +- R: `packageVersion('<pkg>')` then `?function_name` to verify parameters + +If code throws ImportError, AttributeError, or TypeError, introspect the installed +package and adapt the example to match the actual API rather than retrying. + +# Single-Cell Preprocessing + +**"Preprocess my scRNA-seq data"** → Filter low-quality cells/genes, normalize counts, identify highly variable genes, and prepare data for dimensionality reduction and clustering.
+- Python: `scanpy.pp.filter_cells()` → `normalize_total()` → `log1p()` → `highly_variable_genes()` +- R: `Seurat::NormalizeData()` → `FindVariableFeatures()` → `ScaleData()` + +Quality control, filtering, normalization, and feature selection for scRNA-seq data. + +## Scanpy (Python) + +**Goal:** Preprocess scRNA-seq data through QC filtering, normalization, and feature selection using Scanpy. + +**Approach:** Calculate per-cell quality metrics, filter low-quality cells/genes, normalize library sizes, identify highly variable genes, and scale for downstream analysis. + +### Required Imports + +```python +import scanpy as sc +import numpy as np +``` + +### Calculate QC Metrics + +```python +# Calculate mitochondrial gene percentage +adata.var['mt'] = adata.var_names.str.startswith('MT-') +sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) + +# Key metrics added to adata.obs: +# - n_genes_by_counts: genes detected per cell +# - total_counts: total UMI counts per cell +# - pct_counts_mt: percentage mitochondrial +``` + +### Visualize QC Metrics + +```python +import matplotlib.pyplot as plt + +sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True) +sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt') +sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts') +``` + +### Filter Cells and Genes + +```python +# Filter cells by QC metrics +sc.pp.filter_cells(adata, min_genes=200) +sc.pp.filter_cells(adata, max_genes=5000) + +# Filter by mitochondrial percentage +adata = adata[adata.obs['pct_counts_mt'] < 20, :].copy() + +# Filter genes +sc.pp.filter_genes(adata, min_cells=3) + +print(f'After filtering: {adata.n_obs} cells, {adata.n_vars} genes') +``` + +### Store Raw Counts + +```python +# Store raw counts before normalization +adata.raw = adata.copy() +# Also keep the counts in a layer (the seurat_v3 HVG step below reads layer='counts') +adata.layers['counts'] = adata.X.copy() +``` + +### Normalization + +```python +# Library size
normalization (normalize to 10,000 counts per cell) +sc.pp.normalize_total(adata, target_sum=1e4) + +# Log transform +sc.pp.log1p(adata) +``` + +### Highly Variable Genes + +```python +# Identify highly variable genes (default: top 2000) +sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor='seurat_v3', layer='counts') + +# Visualize +sc.pl.highly_variable_genes(adata) + +# Check results +print(f'Highly variable genes: {adata.var.highly_variable.sum()}') +``` + +### Subset to HVGs (Optional) + +```python +# Keep only highly variable genes for downstream analysis +adata_hvg = adata[:, adata.var.highly_variable].copy() +``` + +### Scaling (Z-score) + +```python +# Scale to unit variance and zero mean +sc.pp.scale(adata, max_value=10) +``` + +### Regress Out Confounders + +```python +# Regress out unwanted variation (here: sequencing depth and mitochondrial fraction); if used, typically run before sc.pp.scale +sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt']) +``` + +### Complete Preprocessing Pipeline + +**Goal:** Run end-to-end preprocessing from raw 10X counts to analysis-ready data. + +**Approach:** Chain QC, filtering, normalization, HVG selection, and scaling into a single pipeline. + +```python +import scanpy as sc + +adata = sc.read_10x_mtx('filtered_feature_bc_matrix/') + +# QC +adata.var['mt'] = adata.var_names.str.startswith('MT-') +sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True) + +# Filter +sc.pp.filter_cells(adata, min_genes=200) +sc.pp.filter_genes(adata, min_cells=3) +adata = adata[adata.obs['pct_counts_mt'] < 20, :].copy() + +# Store raw +adata.raw = adata.copy() + +# Normalize +sc.pp.normalize_total(adata, target_sum=1e4) +sc.pp.log1p(adata) + +# HVGs +sc.pp.highly_variable_genes(adata, n_top_genes=2000) + +# Scale +adata = adata[:, adata.var.highly_variable].copy() +sc.pp.scale(adata, max_value=10) +``` + +--- + +## Seurat (R) + +**Goal:** Preprocess scRNA-seq data through QC filtering, normalization, and feature selection using Seurat.
+ +**Approach:** Calculate mitochondrial percentages, filter cells by QC thresholds, normalize with log or SCTransform, identify variable features, and scale for PCA. + +### Required Libraries + +```r +library(Seurat) +library(ggplot2) +``` + +### Calculate QC Metrics + +```r +# Calculate mitochondrial percentage +seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-') + +# View QC metrics +head(seurat_obj@meta.data) +``` + +### Visualize QC Metrics + +```r +# Violin plots +VlnPlot(seurat_obj, features = c('nFeature_RNA', 'nCount_RNA', 'percent.mt'), ncol = 3) + +# Scatter plots +plot1 <- FeatureScatter(seurat_obj, feature1 = 'nCount_RNA', feature2 = 'percent.mt') +plot2 <- FeatureScatter(seurat_obj, feature1 = 'nCount_RNA', feature2 = 'nFeature_RNA') +plot1 + plot2 +``` + +### Filter Cells + +```r +# Filter by QC metrics +seurat_obj <- subset(seurat_obj, + subset = nFeature_RNA > 200 & + nFeature_RNA < 5000 & + percent.mt < 20) + +cat('After filtering:', ncol(seurat_obj), 'cells\n') +``` + +### Normalization (Log Normalization) + +```r +# Standard log normalization +seurat_obj <- NormalizeData(seurat_obj, normalization.method = 'LogNormalize', scale.factor = 10000) +``` + +### Normalization (SCTransform) + +```r +# SCTransform - recommended for most workflows +# Combines normalization, scaling, and HVG selection +seurat_obj <- SCTransform(seurat_obj, vars.to.regress = 'percent.mt', verbose = FALSE) +``` + +### Find Variable Features + +```r +# Identify highly variable features (if not using SCTransform) +seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = 'vst', nfeatures = 2000) + +# Visualize +top10 <- head(VariableFeatures(seurat_obj), 10) +plot1 <- VariableFeaturePlot(seurat_obj) +plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE) +plot2 +``` + +### Scaling + +```r +# Scale data (if not using SCTransform) +all.genes <- rownames(seurat_obj) +seurat_obj <- ScaleData(seurat_obj, features = all.genes) + +# Or 
scale only variable features (faster) +seurat_obj <- ScaleData(seurat_obj) +``` + +### Regress Out Confounders + +```r +# Regress out unwanted variation during scaling +seurat_obj <- ScaleData(seurat_obj, vars.to.regress = c('percent.mt', 'nCount_RNA')) +``` + +### Complete Preprocessing Pipeline (Log Normalization) + +**Goal:** Run end-to-end Seurat preprocessing with standard log normalization. + +**Approach:** Load 10X data, compute QC metrics, filter, normalize with LogNormalize, select variable features, and scale. + +```r +library(Seurat) + +counts <- Read10X(data.dir = 'filtered_feature_bc_matrix/') +seurat_obj <- CreateSeuratObject(counts = counts, min.cells = 3, min.features = 200) + +# QC +seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-') + +# Filter +seurat_obj <- subset(seurat_obj, + subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20) + +# Normalize +seurat_obj <- NormalizeData(seurat_obj) + +# HVGs +seurat_obj <- FindVariableFeatures(seurat_obj, nfeatures = 2000) + +# Scale +seurat_obj <- ScaleData(seurat_obj) +``` + +### Complete Preprocessing Pipeline (SCTransform) + +**Goal:** Run end-to-end Seurat preprocessing with SCTransform for variance-stabilized normalization. + +**Approach:** Load 10X data, compute QC metrics, filter, and apply SCTransform which jointly normalizes, selects HVGs, and scales. 
+ +```r +library(Seurat) + +counts <- Read10X(data.dir = 'filtered_feature_bc_matrix/') +seurat_obj <- CreateSeuratObject(counts = counts, min.cells = 3, min.features = 200) + +# QC +seurat_obj[['percent.mt']] <- PercentageFeatureSet(seurat_obj, pattern = '^MT-') + +# Filter +seurat_obj <- subset(seurat_obj, + subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 20) + +# SCTransform (does normalization, HVG, and scaling) +seurat_obj <- SCTransform(seurat_obj, vars.to.regress = 'percent.mt', verbose = FALSE) +``` + +--- + +## QC Thresholds Reference + +| Metric | Typical Range | Notes | +|--------|---------------|-------| +| min_genes | 200-500 | Remove empty droplets | +| max_genes | 2500-5000 | Remove doublets | +| max_mt | 5-20% | Remove dying cells (tissue-dependent) | +| min_cells | 3-10 | Remove rarely detected genes | + +## Method Comparison + +| Step | Scanpy | Seurat (Standard) | Seurat (SCTransform) | +|------|--------|-------------------|---------------------| +| Normalize | `normalize_total` + `log1p` | `NormalizeData` | `SCTransform` | +| HVGs | `highly_variable_genes` | `FindVariableFeatures` | (included) | +| Scale | `scale` | `ScaleData` | (included) | +| Regress | `regress_out` | `ScaleData(vars.to.regress)` | `SCTransform(vars.to.regress)` | + +## Related Skills + +- data-io - Load data before preprocessing +- clustering - PCA and clustering after preprocessing +- markers-annotation - Find markers after clustering diff --git a/container/skills/bio-single-cell-trajectory/scripts/analyze b/container/skills/bio-single-cell-trajectory/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-trajectory/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-trajectory/scripts/process b/container/skills/bio-single-cell-trajectory/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/bio-single-cell-trajectory/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-trajectory/scripts/run b/container/skills/bio-single-cell-trajectory/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-trajectory/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-trajectory/scripts/search b/container/skills/bio-single-cell-trajectory/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-trajectory/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-single-cell-trajectory/skill.md b/container/skills/bio-single-cell-trajectory/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-single-cell-trajectory/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-multiomics/scripts/analyze b/container/skills/bio-spatial-multiomics/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-multiomics/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-multiomics/scripts/process b/container/skills/bio-spatial-multiomics/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-multiomics/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-multiomics/scripts/run b/container/skills/bio-spatial-multiomics/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-multiomics/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git 
a/container/skills/bio-spatial-multiomics/scripts/search b/container/skills/bio-spatial-multiomics/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-multiomics/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-multiomics/skill.md b/container/skills/bio-spatial-multiomics/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-multiomics/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/analyze b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/process b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/run b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/search b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-transcriptomics-preprocessing/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end 
of file diff --git a/container/skills/bio-spatial-transcriptomics-preprocessing/skill.md b/container/skills/bio-spatial-transcriptomics-preprocessing/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-spatial-transcriptomics-preprocessing/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-structural-variant/scripts/analyze b/container/skills/bio-structural-variant/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-structural-variant/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-structural-variant/scripts/process b/container/skills/bio-structural-variant/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-structural-variant/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-structural-variant/scripts/run b/container/skills/bio-structural-variant/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-structural-variant/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-structural-variant/scripts/search b/container/skills/bio-structural-variant/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-structural-variant/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-structural-variant/skill.md b/container/skills/bio-structural-variant/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-structural-variant/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-tcr-seq/scripts/analyze b/container/skills/bio-tcr-seq/scripts/analyze new file mode 100755 index 0000000..1becba2 --- 
/dev/null +++ b/container/skills/bio-tcr-seq/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-tcr-seq/scripts/process b/container/skills/bio-tcr-seq/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-tcr-seq/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-tcr-seq/scripts/run b/container/skills/bio-tcr-seq/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-tcr-seq/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-tcr-seq/scripts/search b/container/skills/bio-tcr-seq/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-tcr-seq/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-tcr-seq/skill.md b/container/skills/bio-tcr-seq/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-tcr-seq/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-tools/SKILL.md b/container/skills/bio-tools/SKILL.md index 417b5e5..6379a44 100644 --- a/container/skills/bio-tools/SKILL.md +++ b/container/skills/bio-tools/SKILL.md @@ -101,156 +101,5 @@ print(f"LogP: {Descriptors.MolLogP(mol):.2f}") - For remote BLAST against NCBI, use `Bio.Blast.NCBIWWW.qblast()` — this sends the query over the network - For large files, prefer streaming with `SeqIO.parse()` over `SeqIO.read()` -- **Plots**: Save to `/workspace/group/plot.png` with `dpi=150, bbox_inches="tight"`. For publication-ready figures, use `cnsplots` or `pyGenomeTracks` (see below). 
+- Save plots to files (`plt.savefig("/workspace/group/plot.png")`) since there's no display - Write output files to `/workspace/group/` so the user can access them -- **Versioning**: When re-running analysis, save to `output/YYYY-MM-DD/` to avoid overwriting; update `_latest.md` with paths to newest outputs - -## Reusable Figure Templates - -Prefer these built-in scripts when creating common BioClaw figures, instead of writing one-off plotting code from scratch. - -### Volcano Plot Template -Path: - -```bash -/home/node/.claude/skills/bio-tools/volcano_plot_template.py -``` - -Example: - -```bash -python /home/node/.claude/skills/bio-tools/volcano_plot_template.py \ - --input /workspace/group/counts.csv \ - --output /workspace/group/volcano_plot.png \ - --title "Differential Expression Volcano Plot" -``` - -Expected columns by default: `gene`, `log2FC`, `pvalue` - -### QC Summary Plot Template -Path: - -```bash -/home/node/.claude/skills/bio-tools/qc_summary_plot_template.py -``` - -Example: - -```bash -python /home/node/.claude/skills/bio-tools/qc_summary_plot_template.py \ - --input /workspace/group/qc_metrics.csv \ - --output /workspace/group/qc_summary.png \ - --title "Sequencing QC Summary" -``` - -Expected sample column by default: `sample` - -Useful metric columns: `total_reads`, `q30_pct`, `gc_pct`, `duplication_pct` - -### PyMOL Render Template -Path: - -```bash -/home/node/.claude/skills/bio-tools/pymol_render_template.py -``` - -Examples: - -```bash -python /home/node/.claude/skills/bio-tools/pymol_render_template.py \ - --input 1M17 \ - --output /workspace/group/1m17_render.png \ - --highlight-selection "resn AQ4" -``` - -```bash -python /home/node/.claude/skills/bio-tools/pymol_render_template.py \ - --input /workspace/group/structure.pdb \ - --output /workspace/group/structure_render.png \ - --style cartoon -``` - -### Inline Plot Snippets (Heatmap, PCA, Bar) - -When the built-in scripts don't fit, use these patterns. Save to `/workspace/group/.png`. 
- -**Heatmap** (rows=genes, columns=samples): -```python -import pandas as pd -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns -df = pd.read_csv("/workspace/group/expression.csv", index_col=0) -sns.heatmap(np.log1p(df).iloc[:50], cmap='RdBu_r', center=0) -plt.savefig("/workspace/group/heatmap.png", dpi=150, bbox_inches="tight") -``` - -**PCA scatter** (columns: PC1, PC2, condition): -```python -import pandas as pd -import matplotlib.pyplot as plt -coords = pd.read_csv("/workspace/group/pca_coords.csv") -for c in coords['condition'].unique(): - sub = coords[coords['condition'] == c] - plt.scatter(sub['PC1'], sub['PC2'], label=c) -plt.legend() -plt.savefig("/workspace/group/pca.png", dpi=150, bbox_inches="tight") -``` - -**Bar plot** (columns: gene, count): -```python -import pandas as pd -import matplotlib.pyplot as plt -df = pd.read_csv("/workspace/group/top_genes.csv").head(20).sort_values('count', ascending=True) -plt.barh(df['gene'], df['count']) -plt.savefig("/workspace/group/barplot.png", dpi=150, bbox_inches="tight") -``` - -## Publication-Ready Plots (cnsplots) - -**cnsplots** provides Cell/Nature/Science journal-style figures. Use for volcano, bar, box, violin, heatmap, etc. 
- -```python -import cnsplots as cns -import pandas as pd -import numpy as np - -# Volcano plot (columns: gene, log2FC, pvalue or padj) -df = pd.read_csv("/workspace/group/counts.csv") -df["-log10(p)"] = -np.log10(df["pvalue"].clip(lower=1e-300)) # or use padj -cns.figure(height=200, width=200) -cns.volcanoplot(data=df, x="log2FC", y="-log10(p)", symbol="gene") -cns.savefig("/workspace/group/volcano_cns.png") -``` - -```python -# Boxplot with Mann-Whitney test -cns.figure(150, 150) -cns.boxplot(data=df, x="group", y="value", pairs="all") -cns.savefig("/workspace/group/boxplot.png") -``` - -```python -# Heatmap from AnnData (single-cell) -import scanpy as sc -adata = sc.read_h5ad("/workspace/group/data.h5ad") -cns.figure(200, 200) -cns.heatmapplot(adata, row_cluster=True, col_cluster=True, cmap="bwr") -cns.savefig("/workspace/group/heatmap_cns.png") -``` - -See [cnsplots docs](https://cnsplots.farid.one/) for more: violin, scatter, survival, ROC, GSEA, etc. - -## Genome Browser Tracks (pyGenomeTracks) - -**pyGenomeTracks** plots genome browser tracks (BED, BigWig, GTF, etc.). BEDTools must be installed (already in container). - -```bash -# 1. Create config from your files -make_tracks_file --trackFiles /workspace/group/peaks.bed /workspace/group/coverage.bw -o /workspace/group/tracks.ini - -# 2. Plot a region (chr:start-end) -pyGenomeTracks --tracks /workspace/group/tracks.ini --region chr1:1000000-4000000 -o /workspace/group/genome_tracks.png --dpi 150 -``` - -Supported file types: `.bed`, `.bw` (bigwig), `.gtf`, `.gff`, `.arcs`, `.links`. Edit `tracks.ini` to adjust track colors, heights, titles. 
diff --git a/container/skills/bio-tools/pymol_render_template.py b/container/skills/bio-tools/pymol_render_template.py deleted file mode 100644 index 8f13c99..0000000 --- a/container/skills/bio-tools/pymol_render_template.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python3 -import argparse -from pathlib import Path - -from pymol import cmd - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Render a clean protein structure image with PyMOL.") - parser.add_argument("--input", required=True, help="PDB/mmCIF file path or 4-character PDB ID.") - parser.add_argument("--output", required=True, help="Output image path.") - parser.add_argument("--title", default="", help="Optional title stored in PyMOL scene name.") - parser.add_argument("--highlight-selection", default="", help="Optional PyMOL selection to highlight.") - parser.add_argument("--highlight-color", default="tv_orange", help="Color for highlight selection.") - parser.add_argument("--width", type=int, default=1800, help="Output image width in pixels.") - parser.add_argument("--height", type=int, default=1400, help="Output image height in pixels.") - parser.add_argument("--style", choices=["cartoon", "surface"], default="cartoon", help="Main representation style.") - return parser - - -def load_structure(input_value: str) -> None: - input_path = Path(input_value) - if input_path.exists(): - cmd.load(str(input_path), "structure") - elif len(input_value) == 4 and input_value.isalnum(): - cmd.fetch(input_value, name="structure", type="pdb1") - else: - raise SystemExit(f"Input not found and does not look like a PDB ID: {input_value}") - - -def main() -> None: - args = build_parser().parse_args() - output_path = Path(args.output) - output_path.parent.mkdir(parents=True, exist_ok=True) - - cmd.reinitialize() - load_structure(args.input) - - cmd.remove("solvent") - cmd.hide("everything", "all") - cmd.show(args.style, "polymer.protein") - cmd.color("gray80", 
"polymer.protein") - cmd.set("cartoon_fancy_helices", 1) - cmd.set("ray_opaque_background", 0) - cmd.bg_color("white") - cmd.orient("all") - cmd.zoom("all", buffer=3.0) - - if args.highlight_selection: - cmd.show("sticks", args.highlight_selection) - cmd.color(args.highlight_color, args.highlight_selection) - - if args.title: - cmd.scene(args.title, "store") - - cmd.ray(args.width, args.height) - cmd.png(str(output_path), width=args.width, height=args.height, dpi=300, ray=1) - - -if __name__ == "__main__": - main() diff --git a/container/skills/bio-tools/qc_summary_plot_template.py b/container/skills/bio-tools/qc_summary_plot_template.py deleted file mode 100644 index dbffd96..0000000 --- a/container/skills/bio-tools/qc_summary_plot_template.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 -import argparse -from pathlib import Path - -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - - -DEFAULT_COLUMNS = [ - "total_reads", - "q30_pct", - "gc_pct", - "duplication_pct", -] - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Create a compact QC summary plot from a CSV/TSV table.") - parser.add_argument("--input", required=True, help="QC summary table in CSV/TSV format.") - parser.add_argument("--output", required=True, help="Output PNG path.") - parser.add_argument("--title", default="QC Summary", help="Plot title.") - parser.add_argument("--sample-col", default="sample", help="Sample column name.") - parser.add_argument( - "--metrics", - nargs="*", - default=DEFAULT_COLUMNS, - help="Metric columns to visualize. 
Missing columns are skipped.", - ) - return parser - - -def read_table(input_path: Path) -> pd.DataFrame: - suffix = input_path.suffix.lower() - sep = "\t" if suffix in {".tsv", ".txt"} else "," - return pd.read_csv(input_path, sep=sep) - - -def main() -> None: - args = build_parser().parse_args() - input_path = Path(args.input) - output_path = Path(args.output) - output_path.parent.mkdir(parents=True, exist_ok=True) - - df = read_table(input_path).copy() - if args.sample_col not in df.columns: - raise SystemExit(f"Missing sample column: {args.sample_col}") - - metrics = [metric for metric in args.metrics if metric in df.columns] - if not metrics: - raise SystemExit( - f"None of the requested metrics were found. Available columns: {', '.join(df.columns)}" - ) - - sns.set_theme(style="whitegrid", context="talk") - rows = 2 if len(metrics) > 2 else 1 - cols = 2 if len(metrics) > 1 else 1 - fig, axes = plt.subplots(rows, cols, figsize=(12, 6 if rows == 1 else 8), dpi=300) - axes_list = axes.flatten() if hasattr(axes, "flatten") else [axes] - palette = sns.color_palette("Blues_r", n_colors=max(len(df), 3)) - - for index, metric in enumerate(metrics): - ax = axes_list[index] - metric_df = df.sort_values(metric, ascending=False) - sns.barplot( - data=metric_df, - x=args.sample_col, - y=metric, - palette=palette, - ax=ax, - ) - ax.set_title(metric.replace("_", " ").title(), fontsize=13, weight="bold") - ax.set_xlabel("") - ax.set_ylabel(metric.replace("_", " ").title(), fontsize=11) - ax.tick_params(axis="x", rotation=35, labelsize=9) - ax.tick_params(axis="y", labelsize=9) - - for index in range(len(metrics), len(axes_list)): - axes_list[index].axis("off") - - fig.suptitle(args.title, fontsize=17, weight="bold", y=1.02) - plt.tight_layout() - fig.savefig(output_path, dpi=300, bbox_inches="tight") - - -if __name__ == "__main__": - main() diff --git a/container/skills/bio-tools/volcano_plot_template.py b/container/skills/bio-tools/volcano_plot_template.py deleted file mode 
100644 index 86172d4..0000000 --- a/container/skills/bio-tools/volcano_plot_template.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import math -from pathlib import Path - -import matplotlib.pyplot as plt -import pandas as pd -import seaborn as sns - - -def build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser(description="Create a publication-style volcano plot.") - parser.add_argument("--input", required=True, help="CSV/TSV file with gene, log2FC, and pvalue columns.") - parser.add_argument("--output", required=True, help="Output PNG path.") - parser.add_argument("--title", default="Volcano Plot", help="Plot title.") - parser.add_argument("--fc-col", default="log2FC", help="Fold-change column.") - parser.add_argument("--p-col", default="pvalue", help="P-value column.") - parser.add_argument("--label-col", default="gene", help="Label column.") - parser.add_argument("--fc-threshold", type=float, default=1.0, help="Absolute log2 fold-change threshold.") - parser.add_argument("--p-threshold", type=float, default=0.05, help="P-value significance threshold.") - parser.add_argument("--top-labels", type=int, default=12, help="Maximum number of labels to annotate.") - return parser - - -def read_table(input_path: Path) -> pd.DataFrame: - suffix = input_path.suffix.lower() - sep = "\t" if suffix in {".tsv", ".txt"} else "," - return pd.read_csv(input_path, sep=sep) - - -def main() -> None: - args = build_parser().parse_args() - input_path = Path(args.input) - output_path = Path(args.output) - output_path.parent.mkdir(parents=True, exist_ok=True) - - df = read_table(input_path).copy() - required = {args.fc_col, args.p_col, args.label_col} - missing = required.difference(df.columns) - if missing: - raise SystemExit(f"Missing required columns: {', '.join(sorted(missing))}") - - df = df[[args.label_col, args.fc_col, args.p_col]].dropna().copy() - df["neg_log10_p"] = -df[args.p_col].clip(lower=1e-300).map(math.log10) - - def 
classify(row: pd.Series) -> str: - if row[args.p_col] < args.p_threshold and row[args.fc_col] >= args.fc_threshold: - return "Upregulated" - if row[args.p_col] < args.p_threshold and row[args.fc_col] <= -args.fc_threshold: - return "Downregulated" - return "Not significant" - - df["group"] = df.apply(classify, axis=1) - - palette = { - "Upregulated": "#C0392B", - "Downregulated": "#2E86C1", - "Not significant": "#B3B6B7", - } - - sns.set_theme(style="whitegrid", context="talk") - fig, ax = plt.subplots(figsize=(9, 6), dpi=300) - sns.scatterplot( - data=df, - x=args.fc_col, - y="neg_log10_p", - hue="group", - palette=palette, - s=24, - linewidth=0, - alpha=0.85, - ax=ax, - ) - - ax.axvline(args.fc_threshold, color="#7F8C8D", linestyle="--", linewidth=1) - ax.axvline(-args.fc_threshold, color="#7F8C8D", linestyle="--", linewidth=1) - ax.axhline(-math.log10(args.p_threshold), color="#7F8C8D", linestyle="--", linewidth=1) - - label_candidates = df[df["group"] != "Not significant"].copy() - label_candidates["label_score"] = label_candidates["neg_log10_p"] * label_candidates[args.fc_col].abs() - label_candidates = label_candidates.nlargest(args.top_labels, "label_score") - - for _, row in label_candidates.iterrows(): - ax.text( - row[args.fc_col], - row["neg_log10_p"] + 0.08, - str(row[args.label_col]), - fontsize=9, - ha="center", - va="bottom", - ) - - ax.set_title(args.title, fontsize=16, weight="bold") - ax.set_xlabel("log2 Fold Change", fontsize=13) - ax.set_ylabel("-log10(p-value)", fontsize=13) - ax.legend(title="", frameon=False, loc="upper right") - sns.despine(ax=ax) - plt.tight_layout() - fig.savefig(output_path, dpi=300, bbox_inches="tight") - - -if __name__ == "__main__": - main() diff --git a/container/skills/bio-variant-annotation/scripts/analyze b/container/skills/bio-variant-annotation/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-annotation/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No 
newline at end of file diff --git a/container/skills/bio-variant-annotation/scripts/process b/container/skills/bio-variant-annotation/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-annotation/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-annotation/scripts/run b/container/skills/bio-variant-annotation/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-annotation/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-annotation/scripts/search b/container/skills/bio-variant-annotation/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-annotation/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-annotation/skill.md b/container/skills/bio-variant-annotation/skill.md new file mode 100644 index 0000000..a5dd0d1 --- /dev/null +++ b/container/skills/bio-variant-annotation/skill.md @@ -0,0 +1,410 @@ +--- +name: bio-variant-annotation +description: Comprehensive variant annotation using bcftools annotate/csq, VEP, SnpEff, and ANNOVAR. Add database annotations, predict functional consequences, and assess clinical significance. Use when annotating variants with functional and clinical information. +tool_type: mixed +primary_tool: VEP +--- + +## Version Compatibility + +Reference examples tested with: bcftools 1.19+ + +Before using code patterns, verify installed versions match. 
If versions differ: +- Python: `pip show <package>` then `help(module.function)` to check signatures +- CLI: `<tool> --version` then `<tool> --help` to confirm flags + +If code throws ImportError, AttributeError, or TypeError, introspect the installed +package and adapt the example to match the actual API rather than retrying. 
+ +# Variant Annotation + +## Tool Comparison + +| Tool | Best For | Speed | Output | +|------|----------|-------|--------| +| bcftools csq | Simple consequence prediction | Fast | VCF | +| VEP | Comprehensive with plugins | Moderate | VCF/TXT | +| SnpEff | Fast batch annotation | Fast | VCF | +| ANNOVAR | Flexible databases | Moderate | TXT | + +## bcftools annotate + +**Goal:** Add or remove INFO/ID annotations from external databases using bcftools. + +**Approach:** Match variants by position and allele against annotation VCF/BED/TAB files, copying specified columns. + +**"Add rsIDs to my VCF from dbSNP"** → Match variant positions against a database and copy identifiers or annotation fields into the VCF. + +### Add Annotations from Database + +```bash +bcftools annotate -a dbsnp.vcf.gz -c ID input.vcf.gz -Oz -o annotated.vcf.gz +``` + +### Annotation Columns (`-c`) + +| Option | Description | +|--------|-------------| +| `ID` | Copy ID column | +| `INFO` | Copy all INFO fields | +| `INFO/TAG` | Copy specific INFO field | +| `+INFO/TAG` | Add to existing values | + +### Add rsIDs from dbSNP + +```bash +bcftools annotate -a dbsnp.vcf.gz -c ID input.vcf.gz -Oz -o with_rsids.vcf.gz +``` + +### Add Multiple Annotations + +```bash +bcftools annotate -a database.vcf.gz -c ID,INFO/AF,INFO/CAF input.vcf.gz -Oz -o annotated.vcf.gz +``` + +### Add from BED/TAB Files + +```bash +# BED with 4th column as annotation +bcftools annotate -a regions.bed.gz -c CHROM,FROM,TO,INFO/REGION \ + -h <(echo '##INFO=<ID=REGION,Number=1,Type=String,Description="Region name">') \ + input.vcf.gz -Oz -o annotated.vcf.gz + +# Tab file: CHROM POS VALUE +bcftools annotate -a annotations.tab.gz -c CHROM,POS,INFO/SCORE \ + -h <(echo '##INFO=<ID=SCORE,Number=1,Type=Float,Description="Custom score">') \ + input.vcf.gz -Oz -o annotated.vcf.gz +``` + +### Remove Annotations + +```bash +bcftools annotate -x INFO/DP,INFO/MQ input.vcf.gz -Oz -o clean.vcf.gz +bcftools annotate -x INFO input.vcf.gz -Oz -o minimal.vcf.gz # Remove 
all INFO +``` + +### Set ID from Fields + +```bash +bcftools annotate --set-id 
'%CHROM\_%POS\_%REF\_%ALT' input.vcf.gz -Oz -o with_ids.vcf.gz +``` + +## bcftools csq + +**Goal:** Predict functional consequences of variants using gene annotations. + +**Approach:** Map variants to GFF3 gene models and classify as synonymous, missense, frameshift, etc. + +Simple consequence prediction using GFF annotation. + +```bash +bcftools csq -f reference.fa -g genes.gff3.gz input.vcf.gz -Oz -o consequences.vcf.gz +``` + +### Consequence Types + +| Consequence | Description | +|-------------|-------------| +| `synonymous` | No amino acid change | +| `missense` | Amino acid change | +| `stop_gained` | Introduces stop codon | +| `frameshift` | Changes reading frame | +| `splice_donor/acceptor` | Affects splicing | + +## Ensembl VEP + +**Goal:** Annotate variants comprehensively with consequence, impact, pathogenicity scores, and population frequencies. + +**Approach:** Run VEP with offline cache, enabling SIFT, PolyPhen, HGVS, frequency, and plugin-based predictions. + +**"Annotate my variants with functional consequences"** → Predict coding effects, impact severity, and pathogenicity using Ensembl's Variant Effect Predictor. 
+ +### Installation + +```bash +conda install -c bioconda ensembl-vep +vep_install -a cf -s homo_sapiens -y GRCh38 --CONVERT +``` + +### Basic Annotation + +```bash +vep -i input.vcf -o output.vcf --vcf --cache --offline +``` + +### Comprehensive Annotation + +```bash +vep -i input.vcf -o output.vcf \ + --vcf \ + --cache --offline \ + --species homo_sapiens \ + --assembly GRCh38 \ + --everything \ + --fork 4 +``` + +### --everything Enables + +- `--sift b` - SIFT predictions +- `--polyphen b` - PolyPhen predictions +- `--hgvs` - HGVS nomenclature +- `--symbol` - Gene symbols +- `--canonical` - Canonical transcript +- `--af` - 1000 Genomes frequencies +- `--af_gnomade/g` - gnomAD frequencies +- `--pubmed` - PubMed IDs + +### Filter by Impact + +```bash +vep -i input.vcf -o output.vcf --vcf \ + --cache --offline \ + --pick \ + --filter "IMPACT in HIGH,MODERATE" +``` + +### Plugins + +```bash +# CADD scores +vep -i input.vcf -o output.vcf --vcf \ + --cache --offline \ + --plugin CADD,whole_genome_SNVs.tsv.gz + +# dbNSFP (multiple predictors) +vep -i input.vcf -o output.vcf --vcf \ + --cache --offline \ + --plugin dbNSFP,dbNSFP4.3a.gz,ALL + +# Multiple plugins +vep -i input.vcf -o output.vcf --vcf \ + --cache --offline \ + --plugin CADD,cadd.tsv.gz \ + --plugin dbNSFP,dbnsfp.gz,SIFT_score,Polyphen2_HDIV_score \ + --plugin SpliceAI,spliceai.vcf.gz +``` + +### VEP Output Fields + +| Field | Description | +|-------|-------------| +| Consequence | SO term (e.g., missense_variant) | +| IMPACT | HIGH, MODERATE, LOW, MODIFIER | +| SYMBOL | Gene symbol | +| HGVSc/HGVSp | HGVS coding/protein change | +| SIFT/PolyPhen | Pathogenicity predictions | + +## SnpEff + +**Goal:** Annotate variants with gene effects and impact categories using SnpEff. + +**Approach:** Run SnpEff ann against a genome database, then use SnpSift for database cross-referencing and filtering. 
+ +### Installation + +```bash +conda install -c bioconda snpeff +snpEff download GRCh38.105 +``` + +### Basic Annotation + +```bash +snpEff ann GRCh38.105 input.vcf > output.vcf +``` + +### With Statistics + +```bash +snpEff ann -v -stats stats.html -csvStats stats.csv GRCh38.105 input.vcf > output.vcf +``` + +### Filter by Impact + +```bash +snpEff ann GRCh38.105 input.vcf | \ + SnpSift filter "(ANN[*].IMPACT = 'HIGH')" > high_impact.vcf +``` + +### SnpEff Impact Categories + +| Impact | Examples | +|--------|----------| +| HIGH | Stop gained, frameshift, splice donor/acceptor | +| MODERATE | Missense, inframe indel | +| LOW | Synonymous, splice region | +| MODIFIER | Intron, intergenic, UTR | + +### SnpSift Database Annotations + +```bash +# dbSNP +SnpSift annotate dbsnp.vcf.gz input.vcf > annotated.vcf + +# ClinVar +SnpSift annotate clinvar.vcf.gz input.vcf > annotated.vcf + +# dbNSFP +SnpSift dbnsfp -db dbNSFP4.3a.txt.gz input.vcf > annotated.vcf + +# Chain multiple +snpEff ann GRCh38.105 input.vcf | \ + SnpSift annotate dbsnp.vcf.gz | \ + SnpSift annotate clinvar.vcf.gz > fully_annotated.vcf +``` + +### SnpSift Filtering + +```bash +SnpSift filter "(QUAL >= 30) & (DP >= 10)" input.vcf > filtered.vcf +SnpSift filter "(exists CLNSIG) & (CLNSIG has 'Pathogenic')" input.vcf > pathogenic.vcf +``` + +## ANNOVAR + +**Goal:** Annotate variants with gene, frequency, and pathogenicity databases using ANNOVAR. + +**Approach:** Run table_annovar.pl with multiple protocols (gene, filter, region) against downloaded annotation databases. 
+ +### Installation + +```bash +# Download from https://annovar.openbioinformatics.org/ (registration required) +annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb/ +annotate_variation.pl -buildver hg38 -downdb -webfrom annovar gnomad30_genome humandb/ +``` + +### Table Annotation + +```bash +table_annovar.pl input.vcf humandb/ \ + -buildver hg38 \ + -out annotated \ + -remove \ + -protocol refGene,gnomad30_genome,clinvar_20230416,dbnsfp42a \ + -operation g,f,f,f \ + -nastring . \ + -vcfinput +``` + +## Python: Parse Annotated VCF + +**Goal:** Extract and interpret annotation fields from VEP CSQ or SnpEff ANN strings in Python. + +**Approach:** Parse pipe-delimited annotation strings against the header-defined field order, then filter by impact or consequence. + +### Parse VEP CSQ + +```python +from cyvcf2 import VCF + +def parse_vep_csq(csq_string, csq_header): + fields = csq_header.split('|') + values = csq_string.split('|') + return dict(zip(fields, values)) + +vcf = VCF('vep_output.vcf') +csq_header = None +for h in vcf.header_iter(): + if h['HeaderType'] == 'INFO' and h['ID'] == 'CSQ': + csq_header = h['Description'].split('Format: ')[1].rstrip('"') + break + +for variant in vcf: + csq = variant.INFO.get('CSQ') + if csq: + for transcript in csq.split(','): + parsed = parse_vep_csq(transcript, csq_header) + if parsed.get('IMPACT') in ('HIGH', 'MODERATE'): + print(f"{variant.CHROM}:{variant.POS} {parsed['SYMBOL']} {parsed['Consequence']}") +``` + +### Parse SnpEff ANN + +```python +from cyvcf2 import VCF + +def parse_snpeff_ann(ann_string): + fields = ['Allele', 'Annotation', 'Impact', 'Gene_Name', 'Gene_ID', + 'Feature_Type', 'Feature_ID', 'Transcript_BioType', 'Rank', + 'HGVS_c', 'HGVS_p', 'cDNA_pos', 'CDS_pos', 'Protein_pos', 'Distance'] + values = ann_string.split('|') + return dict(zip(fields, values[:len(fields)])) + +for variant in VCF('snpeff_output.vcf'): + ann = variant.INFO.get('ANN') + if ann: + for transcript in 
ann.split(','): + parsed = parse_snpeff_ann(transcript) + if parsed['Impact'] == 'HIGH': + print(f"{variant.CHROM}:{variant.POS} {parsed['Gene_Name']} {parsed['Annotation']}") +``` + +## Complete Annotation Pipeline + +**Goal:** Run a full annotation workflow from normalization through VEP annotation to impact filtering. + +**Approach:** Normalize variants, annotate with VEP (--everything --pick), then filter for HIGH/MODERATE impact. + +```bash +#!/bin/bash +set -euo pipefail + +INPUT=$1 +REFERENCE=$2 +VEP_CACHE=$3 +OUTPUT_PREFIX=$4 + +# Normalize variants +bcftools norm -f $REFERENCE -m-any $INPUT -Oz -o ${OUTPUT_PREFIX}_norm.vcf.gz +bcftools index ${OUTPUT_PREFIX}_norm.vcf.gz + +# VEP annotation +vep -i ${OUTPUT_PREFIX}_norm.vcf.gz \ + -o ${OUTPUT_PREFIX}_vep.vcf \ + --vcf --cache --offline --dir_cache $VEP_CACHE \ + --assembly GRCh38 --everything --pick --fork 4 + +bgzip ${OUTPUT_PREFIX}_vep.vcf +bcftools index ${OUTPUT_PREFIX}_vep.vcf.gz + +# Filter high/moderate impact +bcftools view -i 'INFO/CSQ~"HIGH" || INFO/CSQ~"MODERATE"' \ + ${OUTPUT_PREFIX}_vep.vcf.gz -Oz -o ${OUTPUT_PREFIX}_filtered.vcf.gz +``` + +## Pathogenicity Predictors + +| Predictor | Deleterious | Benign | +|-----------|-------------|--------| +| SIFT | < 0.05 | >= 0.05 | +| PolyPhen-2 (HDIV) | > 0.957 (probably), > 0.453 (possibly) | <= 0.453 | +| CADD | > 20 (top 1%), > 30 (top 0.1%) | < 10 | +| REVEL | > 0.5 | < 0.5 | + +## Clinical Significance (ClinVar) + +| Code | Meaning | +|------|---------| +| Pathogenic | Disease-causing | +| Likely_pathogenic | Probably disease-causing | +| Uncertain_significance | VUS | +| Likely_benign | Probably not disease-causing | +| Benign | Not disease-causing | + +## Quick Reference + +| Task | Command | +|------|---------| +| Add rsIDs | `bcftools annotate -a dbsnp.vcf.gz -c ID in.vcf.gz` | +| VEP annotation | `vep -i in.vcf -o out.vcf --vcf --cache --everything` | +| SnpEff annotation | `snpEff ann GRCh38.105 in.vcf > out.vcf` | +| Consequences only | 
`bcftools csq -f ref.fa -g genes.gff in.vcf.gz` | + +## Related Skills + +- variant-calling/variant-normalization - Normalize before annotating +- variant-calling/filtering-best-practices - Filter by annotations +- variant-calling/vcf-basics - Query annotated fields +- database-access/entrez-fetch - Download annotation databases diff --git a/container/skills/bio-variant-calling/scripts/analyze b/container/skills/bio-variant-calling/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-calling/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-calling/scripts/process b/container/skills/bio-variant-calling/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-calling/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-calling/scripts/run b/container/skills/bio-variant-calling/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-calling/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-calling/scripts/search b/container/skills/bio-variant-calling/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/bio-variant-calling/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/bio-variant-calling/skill.md b/container/skills/bio-variant-calling/skill.md new file mode 100644 index 0000000..c624edb --- /dev/null +++ b/container/skills/bio-variant-calling/skill.md @@ -0,0 +1,266 @@ +--- +name: bio-variant-calling +description: Call SNPs and indels from aligned reads using bcftools mpileup and call. Use when detecting variants from BAM files or generating VCF from alignments. 
+tool_type: cli +primary_tool: bcftools +--- + +## Version Compatibility + +Reference examples tested with: bcftools 1.19+ + +Before using code patterns, verify installed versions match. If versions differ: +- CLI: `<tool> --version` then `<tool> --help` to confirm flags + +If code throws ImportError, AttributeError, or TypeError, introspect the installed +package and adapt the example to match the actual API rather than retrying. + +# Variant Calling + +Call SNPs and indels from aligned reads using bcftools. + +## Basic Workflow + +``` +BAM file + Reference FASTA + | + v + bcftools mpileup (generate pileup) + | + v + bcftools call (call variants) + | + v + VCF file +``` + +## bcftools mpileup + call + +**Goal:** Detect SNPs and indels from aligned reads using the bcftools pileup-and-call pipeline. + +**Approach:** Generate per-position pileup likelihoods with mpileup, then call genotypes with the multiallelic caller. + +**"Call variants from my BAM file"** → Generate genotype likelihoods from aligned reads and identify variant sites using a Bayesian caller. + +### Basic Variant Calling +```bash +bcftools mpileup -f reference.fa input.bam | bcftools call -mv -o variants.vcf +``` + +### Output Compressed VCF +```bash +bcftools mpileup -f reference.fa input.bam | bcftools call -mv -Oz -o variants.vcf.gz +bcftools index variants.vcf.gz +``` + +### Call Specific Region +```bash +bcftools mpileup -f reference.fa -r chr1:1000000-2000000 input.bam | \ + bcftools call -mv -o region.vcf +``` + +### Call from Multiple BAMs +```bash +bcftools mpileup -f reference.fa sample1.bam sample2.bam sample3.bam | \ + bcftools call -mv -o variants.vcf +``` + +### BAM List File +```bash +# bams.txt: one BAM path per line +bcftools mpileup -f reference.fa -b bams.txt | bcftools call -mv -o variants.vcf +``` + +## mpileup Options + +**Goal:** Control pileup generation with quality thresholds, annotations, and region restrictions. 
+ +**Approach:** Set minimum mapping/base quality, request specific FORMAT/INFO tags, and restrict to target regions. + +### Quality Filtering +```bash +# -q 20: min mapping quality; -Q 20: min base quality +bcftools mpileup -f reference.fa \ + -q 20 -Q 20 \ + input.bam | bcftools call -mv -o variants.vcf +``` + +### Annotate with Read Depth +```bash +bcftools mpileup -f reference.fa -a DP,AD input.bam | bcftools call -mv -o variants.vcf +``` + +### Full Annotation Set +```bash +bcftools mpileup -f reference.fa \ + -a FORMAT/DP,FORMAT/AD,FORMAT/ADF,FORMAT/ADR,INFO/AD \ + input.bam | bcftools call -mv -o variants.vcf +``` + +### Target Regions (BED) +```bash +bcftools mpileup -f reference.fa -R targets.bed input.bam | \ + bcftools call -mv -o variants.vcf +``` + +### Max Depth +```bash +bcftools mpileup -f reference.fa -d 1000 input.bam | bcftools call -mv -o variants.vcf +``` + +## call Options + +### Calling Models + +| Flag | Model | Use Case | +|------|-------|----------| +| `-m` | Multiallelic caller | Default, recommended | +| `-c` | Consensus caller | Legacy, single sample | + +### Output Variants Only +```bash +bcftools mpileup -f reference.fa input.bam | bcftools call -mv -o variants.vcf +# -v outputs variant sites only (not reference calls) +``` + +### Output All Sites +```bash +bcftools mpileup -f reference.fa input.bam | bcftools call -m -o all_sites.vcf +# Without -v, outputs all sites including reference +``` + +### Ploidy +```bash +# Haploid calling +bcftools mpileup -f reference.fa input.bam | bcftools call -m --ploidy 1 -o variants.vcf + +# Specify ploidy file +bcftools mpileup -f reference.fa input.bam | bcftools call -m --ploidy-file ploidy.txt -o variants.vcf +``` + +### Prior Probability +```bash +# Adjust variant prior (default 1.1e-3) +bcftools mpileup -f reference.fa input.bam | bcftools call -m -P 0.001 -o variants.vcf +``` + +## Common Pipelines + +**Goal:** Run production-ready variant calling workflows for single-sample and multi-sample analyses. 
+ +**Approach:** Chain mpileup and call with quality filters, annotations, and compressed output, optionally parallelized by chromosome. + +### Standard SNP/Indel Calling +```bash +bcftools mpileup -Ou -f reference.fa \ + -q 20 -Q 20 \ + -a FORMAT/DP,FORMAT/AD \ + input.bam | \ +bcftools call -mv -Oz -o variants.vcf.gz + +bcftools index variants.vcf.gz +``` + +### Multi-sample Calling +```bash +bcftools mpileup -Ou -f reference.fa \ + -a FORMAT/DP,FORMAT/AD \ + sample1.bam sample2.bam sample3.bam | \ +bcftools call -mv -Oz -o cohort.vcf.gz + +bcftools index cohort.vcf.gz +``` + +### Calling with Regions +```bash +bcftools mpileup -Ou -f reference.fa \ + -R targets.bed \ + -a FORMAT/DP,FORMAT/AD \ + input.bam | \ +bcftools call -mv -Oz -o targets.vcf.gz +``` + +### Parallel by Chromosome +```bash +for chr in chr1 chr2 chr3; do + bcftools mpileup -Ou -f reference.fa -r "$chr" input.bam | \ + bcftools call -mv -Oz -o "${chr}.vcf.gz" & +done +wait + +# Concatenate results +bcftools concat -Oz -o all.vcf.gz chr*.vcf.gz +bcftools index all.vcf.gz +``` + +## Annotation Tags + +### INFO Tags +| Tag | Description | +|-----|-------------| +| `DP` | Total read depth | +| `AD` | Allelic depths | +| `MQ` | Mapping quality | +| `FS` | Fisher strand bias | +| `SGB` | Segregation based metric | + +### FORMAT Tags +| Tag | Description | +|-----|-------------| +| `GT` | Genotype | +| `DP` | Read depth per sample | +| `AD` | Allelic depths per sample | +| `ADF` | Forward strand allelic depths | +| `ADR` | Reverse strand allelic depths | +| `GQ` | Genotype quality | +| `PL` | Phred-scaled likelihoods | + +### Request Specific Annotations +```bash +bcftools mpileup -f reference.fa \ + -a FORMAT/DP,FORMAT/AD,FORMAT/SP,INFO/AD \ + input.bam | bcftools call -mv -o variants.vcf +``` + +## Performance Options + +**Goal:** Speed up variant calling for large datasets. + +**Approach:** Use multi-threading and uncompressed BCF piping to reduce I/O overhead. 
+ +### Multi-threading +```bash +bcftools mpileup -f reference.fa --threads 4 input.bam | \ + bcftools call -mv --threads 4 -o variants.vcf +``` + +### Uncompressed BCF for Speed +```bash +bcftools mpileup -Ou -f reference.fa input.bam | bcftools call -mv -Ou | \ + bcftools filter -Oz -o filtered.vcf.gz +``` + +## Quick Reference + +| Task | Command | +|------|---------| +| Basic calling | `bcftools mpileup -f ref.fa in.bam \| bcftools call -mv -o out.vcf` | +| With quality filter | `bcftools mpileup -f ref.fa -q 20 -Q 20 in.bam \| bcftools call -mv` | +| Region | `bcftools mpileup -f ref.fa -r chr1:1-1000 in.bam \| bcftools call -mv` | +| Multi-sample | `bcftools mpileup -f ref.fa s1.bam s2.bam \| bcftools call -mv` | +| With annotations | `bcftools mpileup -f ref.fa -a DP,AD in.bam \| bcftools call -mv` | + +## Common Errors + +| Error | Cause | Solution | +|-------|-------|----------| +| `no FASTA reference` | Missing -f | Add `-f reference.fa` | +| `reference mismatch` | Wrong reference | Use same reference as alignment | +| `no variants called` | Low quality/depth | Lower quality thresholds | + +## Related Skills + +- vcf-basics - View and query resulting VCF +- filtering-best-practices - Filter variants by quality +- variant-normalization - Normalize indels +- alignment-files/pileup-generation - Alternative pileup generation diff --git a/container/skills/drugbank-database/scripts/analyze b/container/skills/drugbank-database/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/drugbank-database/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/drugbank-database/scripts/process b/container/skills/drugbank-database/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/drugbank-database/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/drugbank-database/scripts/run 
b/container/skills/drugbank-database/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/drugbank-database/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/drugbank-database/scripts/search b/container/skills/drugbank-database/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/drugbank-database/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/drugbank-database/skill.md b/container/skills/drugbank-database/skill.md new file mode 100644 index 0000000..488daa0 --- /dev/null +++ b/container/skills/drugbank-database/skill.md @@ -0,0 +1,184 @@ +--- +name: drugbank-database +description: Access and analyze comprehensive drug information from the DrugBank database including drug properties, interactions, targets, pathways, chemical structures, and pharmacology data. This skill should be used when working with pharmaceutical data, drug discovery research, pharmacology studies, drug-drug interaction analysis, target identification, chemical similarity searches, ADMET predictions, or any task requiring detailed drug and drug target information from DrugBank. +--- + +# DrugBank Database + +## Overview + +DrugBank is a comprehensive bioinformatics and cheminformatics database containing detailed information on drugs and drug targets. This skill enables programmatic access to DrugBank data including ~9,591 drug entries (2,037 FDA-approved small molecules, 241 biotech drugs, 96 nutraceuticals, and 6,000+ experimental compounds) with 200+ data fields per entry. + +## Core Capabilities + +### 1. Data Access and Authentication + +Download and access DrugBank data using Python with proper authentication. 
The skill provides guidance on: + +- Installing and configuring the `drugbank-downloader` package +- Managing credentials securely via environment variables or config files +- Downloading specific or latest database versions +- Opening and parsing XML data efficiently +- Working with cached data to optimize performance + +**When to use**: Setting up DrugBank access, downloading database updates, initial project configuration. + +**Reference**: See `references/data-access.md` for detailed authentication, download procedures, API access, caching strategies, and troubleshooting. + +### 2. Drug Information Queries + +Extract comprehensive drug information from the database including identifiers, chemical properties, pharmacology, clinical data, and cross-references to external databases. + +**Query capabilities**: +- Search by DrugBank ID, name, CAS number, or keywords +- Extract basic drug information (name, type, description, indication) +- Retrieve chemical properties (SMILES, InChI, molecular formula) +- Get pharmacology data (mechanism of action, pharmacodynamics, ADME) +- Access external identifiers (PubChem, ChEMBL, UniProt, KEGG) +- Build searchable drug datasets and export to DataFrames +- Filter drugs by type (small molecule, biotech, nutraceutical) + +**When to use**: Retrieving specific drug information, building drug databases, pharmacology research, literature review, drug profiling. + +**Reference**: See `references/drug-queries.md` for XML navigation, query functions, data extraction methods, and performance optimization. + +### 3. Drug-Drug Interactions Analysis + +Analyze drug-drug interactions (DDIs) including mechanism, clinical significance, and interaction networks for pharmacovigilance and clinical decision support. 
+ +**Analysis capabilities**: +- Extract all interactions for specific drugs +- Build bidirectional interaction networks +- Classify interactions by severity and mechanism +- Check interactions between drug pairs +- Identify drugs with most interactions +- Analyze polypharmacy regimens for safety +- Create interaction matrices and network graphs +- Perform community detection in interaction networks +- Calculate interaction risk scores + +**When to use**: Polypharmacy safety analysis, clinical decision support, drug interaction prediction, pharmacovigilance research, identifying contraindications. + +**Reference**: See `references/interactions.md` for interaction extraction, classification methods, network analysis, and clinical applications. + +### 4. Drug Targets and Pathways + +Access detailed information about drug-protein interactions including targets, enzymes, transporters, carriers, and biological pathways. + +**Target analysis capabilities**: +- Extract drug targets with actions (inhibitor, agonist, antagonist) +- Identify metabolic enzymes (CYP450, Phase II enzymes) +- Analyze transporters (uptake, efflux) for ADME studies +- Map drugs to biological pathways (SMPDB) +- Find drugs targeting specific proteins +- Identify drugs with shared targets for repurposing +- Analyze polypharmacology and off-target effects +- Extract Gene Ontology (GO) terms for targets +- Cross-reference with UniProt for protein data + +**When to use**: Mechanism of action studies, drug repurposing research, target identification, pathway analysis, predicting off-target effects, understanding drug metabolism. + +**Reference**: See `references/targets-pathways.md` for target extraction, pathway analysis, repurposing strategies, CYP450 profiling, and transporter analysis. + +### 5. Chemical Properties and Similarity + +Perform structure-based analysis including molecular similarity searches, property calculations, substructure searches, and ADMET predictions. 
+ +**Chemical analysis capabilities**: +- Extract chemical structures (SMILES, InChI, molecular formula) +- Calculate physicochemical properties (MW, logP, PSA, H-bonds) +- Apply Lipinski's Rule of Five and Veber's rules +- Calculate Tanimoto similarity between molecules +- Generate molecular fingerprints (Morgan, MACCS, topological) +- Perform substructure searches with SMARTS patterns +- Find structurally similar drugs for repurposing +- Create similarity matrices for drug clustering +- Predict oral absorption and BBB permeability +- Analyze chemical space with PCA and clustering +- Export chemical property databases + +**When to use**: Structure-activity relationship (SAR) studies, drug similarity searches, QSAR modeling, drug-likeness assessment, ADMET prediction, chemical space exploration. + +**Reference**: See `references/chemical-analysis.md` for structure extraction, similarity calculations, fingerprint generation, ADMET predictions, and chemical space analysis. + +## Typical Workflows + +### Drug Discovery Workflow +1. Use `data-access.md` to download and access latest DrugBank data +2. Use `drug-queries.md` to build searchable drug database +3. Use `chemical-analysis.md` to find similar compounds +4. Use `targets-pathways.md` to identify shared targets +5. Use `interactions.md` to check safety of candidate combinations + +### Polypharmacy Safety Analysis +1. Use `drug-queries.md` to look up patient medications +2. Use `interactions.md` to check all pairwise interactions +3. Use `interactions.md` to classify interaction severity +4. Use `interactions.md` to calculate overall risk score +5. Use `targets-pathways.md` to understand interaction mechanisms + +### Drug Repurposing Research +1. Use `targets-pathways.md` to find drugs with shared targets +2. Use `chemical-analysis.md` to find structurally similar drugs +3. Use `drug-queries.md` to extract indication and pharmacology data +4. 
Use `interactions.md` to assess potential combination therapies + +### Pharmacology Study +1. Use `drug-queries.md` to extract drug of interest +2. Use `targets-pathways.md` to identify all protein interactions +3. Use `targets-pathways.md` to map to biological pathways +4. Use `chemical-analysis.md` to predict ADMET properties +5. Use `interactions.md` to identify potential contraindications + +## Installation Requirements + +### Python Packages +```bash +uv pip install drugbank-downloader # Core access +uv pip install bioversions # Latest version detection +uv pip install lxml # XML parsing optimization +uv pip install pandas # Data manipulation +uv pip install rdkit # Chemical informatics (for similarity) +uv pip install networkx # Network analysis (for interactions) +uv pip install scikit-learn # ML/clustering (for chemical space) +``` + +### Account Setup +1. Create free account at go.drugbank.com +2. Accept license agreement (free for academic use) +3. Obtain username and password credentials +4. Configure credentials as documented in `references/data-access.md` + +## Data Version and Reproducibility + +Always specify the DrugBank version for reproducible research: + +```python +from drugbank_downloader import download_drugbank +path = download_drugbank(version='5.1.10') # Specify exact version +``` + +Document the version used in publications and analysis scripts. + +## Best Practices + +1. **Credentials**: Use environment variables or config files, never hardcode +2. **Versioning**: Specify exact database version for reproducibility +3. **Caching**: Cache parsed data to avoid re-downloading and re-parsing +4. **Namespaces**: Handle XML namespaces properly when parsing +5. **Validation**: Validate chemical structures with RDKit before use +6. **Cross-referencing**: Use external identifiers (UniProt, PubChem) for integration +7. **Clinical Context**: Always consider clinical context when interpreting interaction data +8. 
**License Compliance**: Ensure proper licensing for your use case + +## Reference Documentation + +All detailed implementation guidance is organized in modular reference files: + +- **references/data-access.md**: Authentication, download, parsing, API access, caching +- **references/drug-queries.md**: XML navigation, query methods, data extraction, indexing +- **references/interactions.md**: DDI extraction, classification, network analysis, safety scoring +- **references/targets-pathways.md**: Target/enzyme/transporter extraction, pathway mapping, repurposing +- **references/chemical-analysis.md**: Structure extraction, similarity, fingerprints, ADMET prediction + +Load these references as needed based on your specific analysis requirements. diff --git a/container/skills/ensembl-database/scripts/analyze b/container/skills/ensembl-database/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/ensembl-database/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/ensembl-database/scripts/process b/container/skills/ensembl-database/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/ensembl-database/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/ensembl-database/scripts/run b/container/skills/ensembl-database/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/ensembl-database/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/ensembl-database/scripts/search b/container/skills/ensembl-database/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/ensembl-database/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/ensembl-database/skill.md b/container/skills/ensembl-database/skill.md new file mode 
100644 index 0000000..2359900 --- /dev/null +++ b/container/skills/ensembl-database/skill.md @@ -0,0 +1,305 @@ +--- +name: ensembl-database +description: "Query Ensembl genome database REST API for 250+ species. Gene lookups, sequence retrieval, variant analysis, comparative genomics, orthologs, VEP predictions, for genomic research." +--- + +# Ensembl Database + +## Overview + +Access and query the Ensembl genome database, a comprehensive resource for vertebrate genomic data maintained by EMBL-EBI. The database provides gene annotations, sequences, variants, regulatory information, and comparative genomics data for over 250 species. Current release is 115 (September 2025). + +## When to Use This Skill + +This skill should be used when: + +- Querying gene information by symbol or Ensembl ID +- Retrieving DNA, transcript, or protein sequences +- Analyzing genetic variants using the Variant Effect Predictor (VEP) +- Finding orthologs and paralogs across species +- Accessing regulatory features and genomic annotations +- Converting coordinates between genome assemblies (e.g., GRCh37 to GRCh38) +- Performing comparative genomics analyses +- Integrating Ensembl data into genomic research pipelines + +## Core Capabilities + +### 1. Gene Information Retrieval + +Query gene data by symbol, Ensembl ID, or external database identifiers. + +**Common operations:** +- Look up gene information by symbol (e.g., "BRCA2", "TP53") +- Retrieve transcript and protein information +- Get gene coordinates and chromosomal locations +- Access cross-references to external databases (UniProt, RefSeq, etc.) 
+ +**Using the ensembl_rest package:** +```python +from ensembl_rest import EnsemblClient + +client = EnsemblClient() + +# Look up gene by symbol +gene_data = client.symbol_lookup( + species='human', + symbol='BRCA2' +) + +# Get detailed gene information +gene_info = client.lookup_id( + id='ENSG00000139618', # BRCA2 Ensembl ID + expand=True +) +``` + +**Direct REST API (no package):** +```python +import requests + +server = "https://rest.ensembl.org" + +# Symbol lookup +response = requests.get( + f"{server}/lookup/symbol/homo_sapiens/BRCA2", + headers={"Content-Type": "application/json"} +) +gene_data = response.json() +``` + +### 2. Sequence Retrieval + +Fetch genomic, transcript, or protein sequences in various formats (JSON, FASTA, plain text). + +**Operations:** +- Get DNA sequences for genes or genomic regions +- Retrieve transcript sequences (cDNA) +- Access protein sequences +- Extract sequences with flanking regions or modifications + +**Example:** +```python +# Using ensembl_rest package +sequence = client.sequence_id( + id='ENSG00000139618', # Gene ID + content_type='application/json' +) + +# Get sequence for a genomic region +region_seq = client.sequence_region( + species='human', + region='7:140424943-140624564' # chromosome:start-end +) +``` + +### 3. Variant Analysis + +Query genetic variation data and predict variant consequences using the Variant Effect Predictor (VEP). + +**Capabilities:** +- Look up variants by rsID or genomic coordinates +- Predict functional consequences of variants +- Access population frequency data +- Retrieve phenotype associations + +**VEP example:** +```python +# Predict variant consequences +vep_result = client.vep_hgvs( + species='human', + hgvs_notation='ENST00000380152.7:c.803C>T' +) + +# Query variant by rsID +variant = client.variation_id( + species='human', + id='rs699' +) +``` + +### 4. Comparative Genomics + +Perform cross-species comparisons to identify orthologs, paralogs, and evolutionary relationships. 
+ +**Operations:** +- Find orthologs (same gene in different species) +- Identify paralogs (related genes in same species) +- Access gene trees showing evolutionary relationships +- Retrieve gene family information + +**Example:** +```python +# Find orthologs for a human gene +orthologs = client.homology_ensemblgene( + id='ENSG00000139618', # Human BRCA2 + target_species='mouse' +) + +# Get gene tree +gene_tree = client.genetree_member_symbol( + species='human', + symbol='BRCA2' +) +``` + +### 5. Genomic Region Analysis + +Find all genomic features (genes, transcripts, regulatory elements) in a specific region. + +**Use cases:** +- Identify all genes in a chromosomal region +- Find regulatory features (promoters, enhancers) +- Locate variants within a region +- Retrieve structural features + +**Example:** +```python +# Find all features in a region +features = client.overlap_region( + species='human', + region='7:140424943-140624564', + feature='gene' +) +``` + +### 6. Assembly Mapping + +Convert coordinates between different genome assemblies (e.g., GRCh37 to GRCh38). + +**Important:** Use `https://grch37.rest.ensembl.org` for GRCh37/hg19 queries and `https://rest.ensembl.org` for current assemblies. + +**Example:** +```python +from ensembl_rest import AssemblyMapper + +# Map coordinates from GRCh37 to GRCh38 +mapper = AssemblyMapper( + species='human', + asm_from='GRCh37', + asm_to='GRCh38' +) + +mapped = mapper.map(chrom='7', start=140453136, end=140453136) +``` + +## API Best Practices + +### Rate Limiting + +The Ensembl REST API has rate limits. Follow these practices: + +1. **Respect rate limits:** Maximum 15 requests per second for anonymous users +2. **Handle 429 responses:** When rate-limited, check the `Retry-After` header and wait +3. **Use batch endpoints:** When querying multiple items, use batch endpoints where available +4. 
**Cache results:** Store frequently accessed data to reduce API calls + +### Error Handling + +Always implement proper error handling: + +```python +import requests +import time + +def query_ensembl(endpoint, params=None, max_retries=3): + server = "https://rest.ensembl.org" + headers = {"Content-Type": "application/json"} + + for attempt in range(max_retries): + response = requests.get( + f"{server}{endpoint}", + headers=headers, + params=params + ) + + if response.status_code == 200: + return response.json() + elif response.status_code == 429: + # Rate limited - wait and retry (Retry-After may be fractional, e.g. "40.0") + retry_after = float(response.headers.get('Retry-After', 1)) + time.sleep(retry_after) + else: + response.raise_for_status() + + raise Exception(f"Failed after {max_retries} attempts") +``` + +## Installation + +### Python Package (Recommended) + +```bash +uv pip install ensembl_rest +``` + +The `ensembl_rest` package provides a Pythonic interface to all Ensembl REST API endpoints. + +### Direct REST API + +No installation needed - use standard HTTP libraries like `requests`: + +```bash +uv pip install requests +``` + +## Resources + +### references/ + +- `api_endpoints.md`: Comprehensive documentation of all 17 API endpoint categories with examples and parameters + +### scripts/ + +- `ensembl_query.py`: Reusable Python script for common Ensembl queries with built-in rate limiting and error handling + +## Common Workflows + +### Workflow 1: Gene Annotation Pipeline + +1. Look up gene by symbol to get Ensembl ID +2. Retrieve transcript information +3. Get protein sequences for all transcripts +4. Find orthologs in other species +5. Export results + +### Workflow 2: Variant Analysis + +1. Query variant by rsID or coordinates +2. Use VEP to predict functional consequences +3. Check population frequencies +4. Retrieve phenotype associations +5. Generate report + +### Workflow 3: Comparative Analysis + +1. Start with gene of interest in reference species +2. Find orthologs in target species +3. 
Retrieve sequences for all orthologs +4. Compare gene structures and features +5. Analyze evolutionary conservation + +## Species and Assembly Information + +To query available species and assemblies: + +```python +# List all available species +species_list = client.info_species() + +# Get assembly information for a species +assembly_info = client.info_assembly(species='human') +``` + +Common species identifiers: +- Human: `homo_sapiens` or `human` +- Mouse: `mus_musculus` or `mouse` +- Zebrafish: `danio_rerio` or `zebrafish` +- Fruit fly: `drosophila_melanogaster` + +## Additional Resources + +- **Official Documentation:** https://rest.ensembl.org/documentation +- **Python Package Docs:** https://ensemblrest.readthedocs.io +- **EBI Training:** https://www.ebi.ac.uk/training/online/courses/ensembl-rest-api/ +- **Ensembl Browser:** https://useast.ensembl.org +- **GitHub Examples:** https://github.com/Ensembl/ensembl-rest/wiki diff --git a/container/skills/gnomad-database/scripts/analyze b/container/skills/gnomad-database/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/gnomad-database/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/gnomad-database/scripts/process b/container/skills/gnomad-database/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/gnomad-database/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/gnomad-database/scripts/run b/container/skills/gnomad-database/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/gnomad-database/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/gnomad-database/scripts/search b/container/skills/gnomad-database/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ 
b/container/skills/gnomad-database/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/gnomad-database/skill.md b/container/skills/gnomad-database/skill.md new file mode 100644 index 0000000..26cc4ab --- /dev/null +++ b/container/skills/gnomad-database/skill.md @@ -0,0 +1,395 @@ +--- +name: gnomad-database +description: Query gnomAD (Genome Aggregation Database) for population allele frequencies, variant constraint scores (pLI, LOEUF), and loss-of-function intolerance. Essential for variant pathogenicity interpretation, rare disease genetics, and identifying loss-of-function intolerant genes. +license: CC0-1.0 +metadata: + skill-author: Kuan-lin Huang +--- + +# gnomAD Database + +## Overview + +The Genome Aggregation Database (gnomAD) is the largest publicly available collection of human genetic variation, aggregated from large-scale sequencing projects. gnomAD v4 contains exome sequences from 730,947 individuals and genome sequences from 76,215 individuals across diverse ancestries. It provides population allele frequencies, variant consequence annotations, and gene-level constraint metrics that are essential for interpreting the clinical significance of genetic variants. 
+ +**Key resources:** +- gnomAD browser: https://gnomad.broadinstitute.org/ +- GraphQL API: https://gnomad.broadinstitute.org/api +- Data downloads: https://gnomad.broadinstitute.org/downloads +- Documentation: https://gnomad.broadinstitute.org/help + +## When to Use This Skill + +Use gnomAD when: + +- **Variant frequency lookup**: Checking if a variant is rare, common, or absent in the general population +- **Pathogenicity assessment**: Rare variants (MAF < 1%) are candidates for disease causation; gnomAD helps filter benign common variants +- **Loss-of-function intolerance**: Using pLI and LOEUF scores to assess whether a gene tolerates protein-truncating variants +- **Population-stratified frequencies**: Comparing allele frequencies across ancestries (African/African American, Admixed American, Ashkenazi Jewish, East Asian, Finnish, Middle Eastern, Non-Finnish European, South Asian) +- **ClinVar/ACMG variant classification**: gnomAD frequency data feeds into BA1/BS1 evidence codes for variant classification +- **Constraint analysis**: Identifying genes depleted of missense or loss-of-function variation (z-scores, pLI, LOEUF) + +## Core Capabilities + +### 1. gnomAD GraphQL API + +gnomAD uses a GraphQL API accessible at `https://gnomad.broadinstitute.org/api`. Most queries fetch variants by gene or specific genomic position. + +**Datasets available:** +- `gnomad_r4` — gnomAD v4 exomes (recommended default, GRCh38) +- `gnomad_r4_genomes` — gnomAD v4 genomes (GRCh38) +- `gnomad_r3` — gnomAD v3 genomes (GRCh38) +- `gnomad_r2_1` — gnomAD v2 exomes (GRCh37) + +**Reference genomes:** +- `GRCh38` — default for v3/v4 +- `GRCh37` — for v2 + +### 2. 
Querying Variants by Gene + +```python +import requests + +def query_gnomad_gene(gene_symbol, dataset="gnomad_r4", reference_genome="GRCh38"): + """Fetch variants in a gene from gnomAD.""" + url = "https://gnomad.broadinstitute.org/api" + + query = """ + query GeneVariants($gene_symbol: String!, $dataset: DatasetId!, $reference_genome: ReferenceGenomeId!) { + gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) { + gene_id + gene_symbol + variants(dataset: $dataset) { + variant_id + pos + ref + alt + consequence + genome { + af + ac + an + ac_hom + populations { + id + ac + an + af + } + } + exome { + af + ac + an + ac_hom + } + lof + lof_flags + lof_filter + } + } + } + """ + + variables = { + "gene_symbol": gene_symbol, + "dataset": dataset, + "reference_genome": reference_genome + } + + response = requests.post(url, json={"query": query, "variables": variables}) + return response.json() + +# Example +result = query_gnomad_gene("BRCA1") +gene_data = result["data"]["gene"] +variants = gene_data["variants"] + +# Filter to rare PTVs +rare_ptvs = [ + v for v in variants + if (v.get("lof") == "HC" or v.get("consequence") in ["stop_gained", "frameshift_variant"]) + and (v.get("genome") or {}).get("af", 1) < 0.001 +] +print(f"Found {len(rare_ptvs)} rare PTVs in {gene_data['gene_symbol']}") +``` + +### 3. Querying a Specific Variant + +```python +import requests + +def query_gnomad_variant(variant_id, dataset="gnomad_r4"): + """Fetch details for a specific variant (e.g., '1-55516888-G-GA').""" + url = "https://gnomad.broadinstitute.org/api" + + query = """ + query VariantDetails($variantId: String!, $dataset: DatasetId!) 
{ + variant(variantId: $variantId, dataset: $dataset) { + variant_id + chrom + pos + ref + alt + genome { + af + ac + an + ac_hom + populations { + id + ac + an + af + } + } + exome { + af + ac + an + ac_hom + populations { + id + ac + an + af + } + } + consequence + lof + rsids + in_silico_predictors { + id + value + flags + } + clinvar_variation_id + } + } + """ + + response = requests.post( + url, + json={"query": query, "variables": {"variantId": variant_id, "dataset": dataset}} + ) + return response.json() + +# Example: query a specific variant +result = query_gnomad_variant("17-43094692-G-A") # BRCA1 missense +variant = result["data"]["variant"] + +if variant: + genome_af = (variant.get("genome") or {}).get("af", "N/A") + exome_af = (variant.get("exome") or {}).get("af", "N/A") + print(f"Variant: {variant['variant_id']}") + print(f" Consequence: {variant['consequence']}") + print(f" Genome AF: {genome_af}") + print(f" Exome AF: {exome_af}") + print(f" LoF: {variant.get('lof')}") +``` + +### 4. Gene Constraint Scores + +gnomAD constraint scores assess how tolerant a gene is to variation relative to expectation: + +```python +import requests + +def query_gnomad_constraint(gene_symbol, reference_genome="GRCh38"): + """Fetch constraint scores for a gene.""" + url = "https://gnomad.broadinstitute.org/api" + + query = """ + query GeneConstraint($gene_symbol: String!, $reference_genome: ReferenceGenomeId!) 
{ + gene(gene_symbol: $gene_symbol, reference_genome: $reference_genome) { + gene_id + gene_symbol + gnomad_constraint { + exp_lof + exp_mis + exp_syn + obs_lof + obs_mis + obs_syn + oe_lof + oe_mis + oe_syn + oe_lof_lower + oe_lof_upper + lof_z + mis_z + syn_z + pLI + } + } + } + """ + + response = requests.post( + url, + json={"query": query, "variables": {"gene_symbol": gene_symbol, "reference_genome": reference_genome}} + ) + return response.json() + +# Example +result = query_gnomad_constraint("KCNQ2") +gene = result["data"]["gene"] +constraint = gene["gnomad_constraint"] + +print(f"Gene: {gene['gene_symbol']}") +print(f" pLI: {constraint['pLI']:.3f} (>0.9 = LoF intolerant)") +print(f" LOEUF: {constraint['oe_lof_upper']:.3f} (<0.35 = highly constrained)") +print(f" Obs/Exp LoF: {constraint['oe_lof']:.3f}") +print(f" Missense Z: {constraint['mis_z']:.3f}") +``` + +**Constraint score interpretation:** +| Score | Range | Meaning | +|-------|-------|---------| +| `pLI` | 0–1 | Probability of LoF intolerance; >0.9 = highly intolerant | +| `LOEUF` | 0–∞ | LoF observed/expected upper bound; <0.35 = constrained | +| `oe_lof` | 0–∞ | Observed/expected ratio for LoF variants | +| `mis_z` | −∞ to ∞ | Missense constraint z-score; >3.09 = constrained | +| `syn_z` | −∞ to ∞ | Synonymous z-score (control; should be near 0) | + +### 5. Population Frequency Analysis + +```python +import requests +import pandas as pd + +def get_population_frequencies(variant_id, dataset="gnomad_r4"): + """Extract per-population allele frequencies for a variant.""" + url = "https://gnomad.broadinstitute.org/api" + + query = """ + query PopFreqs($variantId: String!, $dataset: DatasetId!) 
{ + variant(variantId: $variantId, dataset: $dataset) { + variant_id + genome { + populations { + id + ac + an + af + ac_hom + } + } + } + } + """ + + response = requests.post( + url, + json={"query": query, "variables": {"variantId": variant_id, "dataset": dataset}} + ) + data = response.json() + populations = data["data"]["variant"]["genome"]["populations"] + + df = pd.DataFrame(populations) + df = df[df["an"] > 0].copy() + df["af"] = df["ac"] / df["an"] + df = df.sort_values("af", ascending=False) + return df + +# Population IDs in gnomAD v4: +# afr = African/African American +# ami = Amish +# amr = Admixed American +# asj = Ashkenazi Jewish +# eas = East Asian +# fin = Finnish +# mid = Middle Eastern +# nfe = Non-Finnish European +# sas = South Asian +# remaining = Other +``` + +### 6. Structural Variants (gnomAD-SV) + +gnomAD also contains a structural variant dataset: + +```python +import requests + +def query_gnomad_sv(gene_symbol): + """Query structural variants overlapping a gene.""" + url = "https://gnomad.broadinstitute.org/api" + + query = """ + query SVsByGene($gene_symbol: String!) { + gene(gene_symbol: $gene_symbol, reference_genome: GRCh38) { + structural_variants { + variant_id + type + chrom + pos + end + af + ac + an + } + } + } + """ + + response = requests.post(url, json={"query": query, "variables": {"gene_symbol": gene_symbol}}) + return response.json() +``` + +## Query Workflows + +### Workflow 1: Variant Pathogenicity Assessment + +1. **Check population frequency** — Is the variant rare enough to be pathogenic? + - Use gnomAD AF < 1% for recessive, < 0.1% for dominant conditions + - Check ancestry-specific frequencies (a variant rare overall may be common in one population) + +2. **Assess functional impact** — LoF variants have highest prior probability + - Check `lof` field: `HC` = high-confidence LoF, `LC` = low-confidence + - Check `lof_flags` for issues like "NAGNAG_SITE", "PHYLOCSF_WEAK" + +3. 
**Apply ACMG criteria:** + - BA1: AF > 5% → Benign Stand-Alone + - BS1: AF > disease prevalence threshold → Benign Strong + - PM2: Absent or very rare in gnomAD → Pathogenic Moderate + +### Workflow 2: Gene Prioritization in Rare Disease + +1. Query constraint scores for candidate genes +2. Filter for pLI > 0.9 (haploinsufficient) or LOEUF < 0.35 +3. Cross-reference with observed LoF variants in the gene +4. Integrate with ClinVar and disease databases + +### Workflow 3: Population Genetics Research + +1. Identify variant of interest from GWAS or clinical data +2. Query per-population frequencies +3. Compare frequency differences across ancestries +4. Test for enrichment in specific founder populations + +## Best Practices + +- **Use gnomAD v4 (gnomad_r4)** for the most current data; use v2 (gnomad_r2_1) only for GRCh37 compatibility +- **Handle null responses**: Variants not observed in gnomAD are not necessarily pathogenic — absence is informative +- **Distinguish exome vs. genome data**: Genome data has more uniform coverage; exome data is larger but may have coverage gaps +- **Rate limit GraphQL queries**: Add delays between requests; batch queries when possible +- **Homozygous counts** (`ac_hom`) are relevant for recessive disease analysis +- **LOEUF is preferred over pLI** for gene constraint (less sensitive to sample size) + +## Data Access + +- **Browser**: https://gnomad.broadinstitute.org/ — interactive variant and gene browsing +- **GraphQL API**: https://gnomad.broadinstitute.org/api — programmatic access +- **Downloads**: https://gnomad.broadinstitute.org/downloads — VCF, Hail tables, constraint tables +- **Google Cloud**: gs://gcp-public-data--gnomad/ + +## Additional Resources + +- **gnomAD website**: https://gnomad.broadinstitute.org/ +- **gnomAD blog**: https://gnomad.broadinstitute.org/news +- **Downloads**: https://gnomad.broadinstitute.org/downloads +- **API explorer**: https://gnomad.broadinstitute.org/api (interactive GraphiQL) +- 
**Constraint documentation**: https://gnomad.broadinstitute.org/help/constraint +- **Citation**: Karczewski KJ et al. (2020) Nature. PMID: 32461654; Chen S et al. (2024) Nature. PMID: 38057664 +- **GitHub**: https://github.com/broadinstitute/gnomad-browser diff --git a/container/skills/kegg-database/scripts/analyze b/container/skills/kegg-database/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/kegg-database/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/kegg-database/scripts/process b/container/skills/kegg-database/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/kegg-database/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/kegg-database/scripts/run b/container/skills/kegg-database/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/kegg-database/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/kegg-database/scripts/search b/container/skills/kegg-database/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/kegg-database/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/kegg-database/skill.md b/container/skills/kegg-database/skill.md new file mode 100644 index 0000000..9a96cf0 --- /dev/null +++ b/container/skills/kegg-database/skill.md @@ -0,0 +1,371 @@ +--- +name: kegg-database +description: "Direct REST API access to KEGG (academic use only). Pathway analysis, gene-pathway mapping, metabolic pathways, drug interactions, ID conversion. For Python workflows with multiple databases, prefer bioservices. Use this for direct HTTP/REST work or KEGG-specific control." 
+--- + +# KEGG Database + +## Overview + +KEGG (Kyoto Encyclopedia of Genes and Genomes) is a comprehensive bioinformatics resource for biological pathway analysis and molecular interaction networks. + +**Important**: KEGG API is made available only for academic use by academic users. + +## When to Use This Skill + +This skill should be used when querying pathways, genes, compounds, enzymes, diseases, and drugs across multiple organisms using KEGG's REST API. + +## Quick Start + +The skill provides: +1. Python helper functions (`scripts/kegg_api.py`) for all KEGG REST API operations +2. Comprehensive reference documentation (`references/kegg_reference.md`) with detailed API specifications + +When users request KEGG data, determine which operation is needed and use the appropriate function from `scripts/kegg_api.py`. + +## Core Operations + +### 1. Database Information (`kegg_info`) + +Retrieve metadata and statistics about KEGG databases. + +**When to use**: Understanding database structure, checking available data, getting release information. + +**Usage**: +```python +from scripts.kegg_api import kegg_info + +# Get pathway database info +info = kegg_info('pathway') + +# Get organism-specific info +hsa_info = kegg_info('hsa') # Human genome +``` + +**Common databases**: `kegg`, `pathway`, `module`, `brite`, `genes`, `genome`, `compound`, `glycan`, `reaction`, `enzyme`, `disease`, `drug` + +### 2. Listing Entries (`kegg_list`) + +List entry identifiers and names from KEGG databases. + +**When to use**: Getting all pathways for an organism, listing genes, retrieving compound catalogs. 
+ +**Usage**: +```python +from scripts.kegg_api import kegg_list + +# List all reference pathways +pathways = kegg_list('pathway') + +# List human-specific pathways +hsa_pathways = kegg_list('pathway', 'hsa') + +# List specific genes (max 10) +genes = kegg_list('hsa:10458+hsa:10459') +``` + +**Common organism codes**: `hsa` (human), `mmu` (mouse), `dme` (fruit fly), `sce` (yeast), `eco` (E. coli) + +### 3. Searching (`kegg_find`) + +Search KEGG databases by keywords or molecular properties. + +**When to use**: Finding genes by name/description, searching compounds by formula or mass, discovering entries by keywords. + +**Usage**: +```python +from scripts.kegg_api import kegg_find + +# Keyword search +results = kegg_find('genes', 'p53') +shiga_toxin = kegg_find('genes', 'shiga toxin') + +# Chemical formula search (exact match) +compounds = kegg_find('compound', 'C7H10N4O2', 'formula') + +# Molecular weight range search +drugs = kegg_find('drug', '300-310', 'exact_mass') +``` + +**Search options**: `formula` (exact match), `exact_mass` (range), `mol_weight` (range) + +### 4. Retrieving Entries (`kegg_get`) + +Get complete database entries or specific data formats. + +**When to use**: Retrieving pathway details, getting gene/protein sequences, downloading pathway maps, accessing compound structures. 
+ +**Usage**: +```python +from scripts.kegg_api import kegg_get + +# Get pathway entry +pathway = kegg_get('hsa00010') # Glycolysis pathway + +# Get multiple entries (max 10) +genes = kegg_get(['hsa:10458', 'hsa:10459']) + +# Get protein sequence (FASTA) +sequence = kegg_get('hsa:10458', 'aaseq') + +# Get nucleotide sequence +nt_seq = kegg_get('hsa:10458', 'ntseq') + +# Get compound structure +mol_file = kegg_get('cpd:C00002', 'mol') # ATP in MOL format + +# Get pathway as JSON (single entry only) +pathway_json = kegg_get('hsa05130', 'json') + +# Get pathway image (single entry only) +pathway_img = kegg_get('hsa05130', 'image') +``` + +**Output formats**: `aaseq` (protein FASTA), `ntseq` (nucleotide FASTA), `mol` (MOL format), `kcf` (KCF format), `image` (PNG), `kgml` (XML), `json` (pathway JSON) + +**Important**: Image, KGML, and JSON formats allow only one entry at a time. + +### 5. ID Conversion (`kegg_conv`) + +Convert identifiers between KEGG and external databases. + +**When to use**: Integrating KEGG data with other databases, mapping gene IDs, converting compound identifiers. + +**Usage**: +```python +from scripts.kegg_api import kegg_conv + +# Convert all human genes to NCBI Gene IDs +conversions = kegg_conv('ncbi-geneid', 'hsa') + +# Convert specific gene +gene_id = kegg_conv('ncbi-geneid', 'hsa:10458') + +# Convert to UniProt +uniprot_id = kegg_conv('uniprot', 'hsa:10458') + +# Convert compounds to PubChem +pubchem_ids = kegg_conv('pubchem', 'compound') + +# Reverse conversion (NCBI Gene ID to KEGG) +kegg_id = kegg_conv('hsa', 'ncbi-geneid') +``` + +**Supported conversions**: `ncbi-geneid`, `ncbi-proteinid`, `uniprot`, `pubchem`, `chebi` + +### 6. Cross-Referencing (`kegg_link`) + +Find related entries within and between KEGG databases. + +**When to use**: Finding pathways containing genes, getting genes in a pathway, mapping genes to KO groups, finding compounds in pathways. 
+ +**Usage**: +```python +from scripts.kegg_api import kegg_link + +# Find pathways linked to human genes +pathways = kegg_link('pathway', 'hsa') + +# Get genes in a specific pathway +genes = kegg_link('genes', 'hsa00010') # Glycolysis genes + +# Find pathways containing a specific gene +gene_pathways = kegg_link('pathway', 'hsa:10458') + +# Find compounds in a pathway +compounds = kegg_link('compound', 'hsa00010') + +# Map genes to KO (orthology) groups +ko_groups = kegg_link('ko', 'hsa:10458') +``` + +**Common links**: genes ↔ pathway, pathway ↔ compound, pathway ↔ enzyme, genes ↔ ko (orthology) + +### 7. Drug-Drug Interactions (`kegg_ddi`) + +Check for drug-drug interactions. + +**When to use**: Analyzing drug combinations, checking for contraindications, pharmacological research. + +**Usage**: +```python +from scripts.kegg_api import kegg_ddi + +# Check single drug +interactions = kegg_ddi('D00001') + +# Check multiple drugs (max 10) +interactions = kegg_ddi(['D00001', 'D00002', 'D00003']) +``` + +## Common Analysis Workflows + +### Workflow 1: Gene to Pathway Mapping + +**Use case**: Finding pathways associated with genes of interest (e.g., for pathway enrichment analysis). + +```python +from scripts.kegg_api import kegg_find, kegg_link, kegg_get + +# Step 1: Find gene ID by name +gene_results = kegg_find('genes', 'p53') + +# Step 2: Link gene to pathways +pathways = kegg_link('pathway', 'hsa:7157') # TP53 gene + +# Step 3: Get detailed pathway information +for pathway_line in pathways.split('\n'): + if pathway_line: + pathway_id = pathway_line.split('\t')[1].replace('path:', '') + pathway_info = kegg_get(pathway_id) + # Process pathway information +``` + +### Workflow 2: Pathway Enrichment Context + +**Use case**: Getting all genes in organism pathways for enrichment analysis. 
+ +```python +from scripts.kegg_api import kegg_list, kegg_link + +# Step 1: List all human pathways +pathways = kegg_list('pathway', 'hsa') + +# Step 2: For each pathway, get associated genes +for pathway_line in pathways.split('\n'): + if pathway_line: + pathway_id = pathway_line.split('\t')[0] + genes = kegg_link('genes', pathway_id) + # Process genes for enrichment analysis +``` + +### Workflow 3: Compound to Pathway Analysis + +**Use case**: Finding metabolic pathways containing compounds of interest. + +```python +from scripts.kegg_api import kegg_find, kegg_link, kegg_get + +# Step 1: Search for compound +compound_results = kegg_find('compound', 'glucose') + +# Step 2: Link compound to reactions +reactions = kegg_link('reaction', 'cpd:C00031') # Glucose + +# Step 3: Link reactions to pathways +pathways = kegg_link('pathway', 'rn:R00299') # Specific reaction + +# Step 4: Get pathway details +pathway_info = kegg_get('map00010') # Glycolysis +``` + +### Workflow 4: Cross-Database Integration + +**Use case**: Integrating KEGG data with UniProt, NCBI, or PubChem databases. + +```python +from scripts.kegg_api import kegg_conv, kegg_get + +# Step 1: Convert KEGG gene IDs to external database IDs +uniprot_map = kegg_conv('uniprot', 'hsa') +ncbi_map = kegg_conv('ncbi-geneid', 'hsa') + +# Step 2: Parse conversion results +for line in uniprot_map.split('\n'): + if line: + kegg_id, uniprot_id = line.split('\t') + # Use external IDs for integration + +# Step 3: Get sequences using KEGG +sequence = kegg_get('hsa:10458', 'aaseq') +``` + +### Workflow 5: Organism-Specific Pathway Analysis + +**Use case**: Comparing pathways across different organisms. 
+ +```python +from scripts.kegg_api import kegg_list, kegg_get + +# Step 1: List pathways for multiple organisms +human_pathways = kegg_list('pathway', 'hsa') +mouse_pathways = kegg_list('pathway', 'mmu') +yeast_pathways = kegg_list('pathway', 'sce') + +# Step 2: Get reference pathway for comparison +ref_pathway = kegg_get('map00010') # Reference glycolysis + +# Step 3: Get organism-specific versions +hsa_glycolysis = kegg_get('hsa00010') +mmu_glycolysis = kegg_get('mmu00010') +``` + +## Pathway Categories + +KEGG organizes pathways into seven major categories. When interpreting pathway IDs or recommending pathways to users: + +1. **Metabolism** (e.g., `map00010` - Glycolysis, `map00190` - Oxidative phosphorylation) +2. **Genetic Information Processing** (e.g., `map03010` - Ribosome, `map03040` - Spliceosome) +3. **Environmental Information Processing** (e.g., `map04010` - MAPK signaling, `map02010` - ABC transporters) +4. **Cellular Processes** (e.g., `map04140` - Autophagy, `map04210` - Apoptosis) +5. **Organismal Systems** (e.g., `map04610` - Complement cascade, `map04910` - Insulin signaling) +6. **Human Diseases** (e.g., `map05200` - Pathways in cancer, `map05010` - Alzheimer disease) +7. **Drug Development** (chronological and target-based classifications) + +Reference `references/kegg_reference.md` for detailed pathway lists and classifications. + +## Important Identifiers and Formats + +### Pathway IDs +- `map#####` - Reference pathway (generic, not organism-specific) +- `hsa#####` - Human pathway +- `mmu#####` - Mouse pathway + +### Gene IDs +- Format: `organism:gene_number` (e.g., `hsa:10458`) + +### Compound IDs +- Format: `cpd:C#####` (e.g., `cpd:C00002` for ATP) + +### Drug IDs +- Format: `dr:D#####` (e.g., `dr:D00001`) + +### Enzyme IDs +- Format: `ec:EC_number` (e.g., `ec:1.1.1.1`) + +### KO (KEGG Orthology) IDs +- Format: `ko:K#####` (e.g., `ko:K00001`) + +## API Limitations + +Respect these constraints when using the KEGG API: + +1. 
**Entry limits**: Maximum 10 entries per operation (except image/kgml/json: 1 entry only) +2. **Academic use**: API is for academic use only; commercial use requires licensing +3. **HTTP status codes**: Check for 200 (success), 400 (bad request), 404 (not found) +4. **Rate limiting**: No explicit limit, but avoid rapid-fire requests + +## Detailed Reference + +For comprehensive API documentation, database specifications, organism codes, and advanced usage, refer to `references/kegg_reference.md`. This includes: + +- Complete list of KEGG databases +- Detailed API operation syntax +- All organism codes +- HTTP status codes and error handling +- Integration with Biopython and R/Bioconductor +- Best practices for API usage + +## Troubleshooting + +**404 Not Found**: Entry or database doesn't exist; verify IDs and organism codes +**400 Bad Request**: Syntax error in API call; check parameter formatting +**Empty results**: Search term may not match entries; try broader keywords +**Image/KGML errors**: These formats only work with single entries; remove batch processing + +## Additional Tools + +For interactive pathway visualization and annotation: +- **KEGG Mapper**: https://www.kegg.jp/kegg/mapper/ +- **BlastKOALA**: Automated genome annotation +- **GhostKOALA**: Metagenome/metatranscriptome annotation diff --git a/container/skills/literature-search/scripts/search b/container/skills/literature-search/scripts/search new file mode 100755 index 0000000..23910df --- /dev/null +++ b/container/skills/literature-search/scripts/search @@ -0,0 +1,20 @@ +#!/bin/bash + +# Search CLI wrapper + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SCRIPT_PATH="${SCRIPT_DIR}/search" + +# Special commands for discoverability +case "$1" in + --path|--location|--where) + echo "$SCRIPT_PATH" + exit 0 + ;; + --script-dir) + echo "$SCRIPT_DIR" + exit 0 + ;; +esac + +node "${SCRIPT_DIR}/search.mjs" "$@" diff --git a/container/skills/literature-search/scripts/search.mjs 
b/container/skills/literature-search/scripts/search.mjs new file mode 100644 index 0000000..dba6e1f --- /dev/null +++ b/container/skills/literature-search/scripts/search.mjs @@ -0,0 +1,169 @@ +#!/usr/bin/env node + +/** + * literature-search Search via Valyu API + * Full-text search across multiple data sources with semantic search capabilities + */ + +import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs'; +import { homedir } from 'os'; +import { join } from 'path'; + +const VALYU_API_BASE = 'https://api.valyu.ai/v1'; +const CONFIG_DIR = join(homedir(), '.valyu'); +const CONFIG_FILE = join(CONFIG_DIR, 'config.json'); + +/** + * Get API key from multiple sources (in order of priority): + * 1. Environment variable (VALYU_API_KEY) + * 2. Config file (~/.valyu/config.json) + */ +function getApiKey() { + if (process.env.VALYU_API_KEY) { + return process.env.VALYU_API_KEY; + } + + if (existsSync(CONFIG_FILE)) { + try { + const config = JSON.parse(readFileSync(CONFIG_FILE, 'utf-8')); + if (config.apiKey) { + return config.apiKey; + } + } catch (e) { + // Ignore parse errors + } + } + + return null; +} + +/** + * Save API key to config file + */ +function saveApiKey(apiKey) { + if (!existsSync(CONFIG_DIR)) { + mkdirSync(CONFIG_DIR, { recursive: true }); + } + + let config = {}; + if (existsSync(CONFIG_FILE)) { + try { + config = JSON.parse(readFileSync(CONFIG_FILE, 'utf-8')); + } catch (e) { + // Start fresh if parse fails + } + } + + config.apiKey = apiKey; + writeFileSync(CONFIG_FILE, JSON.stringify(config, null, 2)); + return true; +} + +/** + * Return setup required response + */ +function setupRequiredResponse() { + return { + success: false, + setup_required: true, + message: "Valyu API key not configured. 
Get your free API key ($10 credits) at https://platform.valyu.ai" + }; +} + +/** + * Search multiple sources via Valyu API + */ +async function search(query, maxResults = 10) { + const apiKey = getApiKey(); + + if (!apiKey) { + return setupRequiredResponse(); + } + + try { + const response = await fetch(`${VALYU_API_BASE}/search`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'x-api-key': apiKey + }, + body: JSON.stringify({ + query: query, + search_type: 'proprietary', + included_sources: ['valyu/valyu-pubmed', 'valyu/valyu-arxiv', 'valyu/valyu-biorxiv', 'valyu/valyu-medrxiv'], + limit: maxResults + }) + }); + + const data = await response.json(); + + if (!response.ok) { + return { + success: false, + error: data.detail || data.message || `HTTP ${response.status}`, + status: response.status + }; + } + + return { + success: true, + type: 'literature_search', + query: query, + result_count: data.results?.length || 0, + results: data.results || [], + cost: data.cost || 0 + }; + + } catch (error) { + return { + success: false, + error: error.message + }; + } +} + +/** + * Setup command - save API key + */ +function setup(apiKey) { + if (!apiKey) { + return { + success: false, + error: "API key required. Usage: search setup " + }; + } + + saveApiKey(apiKey); + return { + success: true, + type: 'setup', + message: "API key saved to ~/.valyu/config.json" + }; +} + +// Main CLI handler +const [,, command, ...args] = process.argv; + +(async () => { + let result; + + if (command === 'setup') { + result = setup(args[0]); + } else { + // Treat first arg as query, second as maxResults + const query = command || ''; + const maxResults = args[0] ? parseInt(args[0], 10) : 10; + + if (!query) { + result = { + success: false, + error: "Query required. Usage: search [maxResults]" + }; + } else { + result = await search(query, maxResults); + } + } + + console.log(JSON.stringify(result, null, 2)); + process.exit(result.success ? 
0 : 1); +})(); diff --git a/container/skills/literature-search/skill.md b/container/skills/literature-search/skill.md new file mode 100644 index 0000000..64612df --- /dev/null +++ b/container/skills/literature-search/skill.md @@ -0,0 +1,214 @@ +--- +name: literature-search +description: Comprehensive scientific literature search across PubMed, arXiv, bioRxiv, medRxiv. Natural language queries powered by Valyu semantic search. +keywords: + - literature-search + - scientific-literature + - multi-source-search + - comprehensive-search + - research-aggregation + - semantic-search +license: MIT +--- + + +# Literature Search + +Search across all major scientific literature databases (PubMed, arXiv, bioRxiv, medRxiv) simultaneously using natural language queries powered by Valyu's semantic search API. + +## Why This Skill is Powerful + +- **No API Parameter Parsing**: Just pass natural language queries directly - no need to construct complex search parameters +- **Semantic Search**: Understands the meaning of your query, not just keyword matching +- **Full-Text Access**: Returns complete article content, not just abstracts +- **Image Links**: Includes figures and images from papers +- **Comprehensive Coverage**: Search across PubMed, arXiv, bioRxiv, and medRxiv simultaneously +- **Unified Results**: Get results from all sources in a single query + +## Requirements + +1. Node.js 18+ (uses built-in fetch) +2. Valyu API key from https://platform.valyu.ai ($10 free credits) + +## CRITICAL: Script Path Resolution + +The `scripts/search` commands in this documentation are relative to this skill's installation directory. 
+ +Before running any command, locate the script using: + +```bash +LITERATURE_SCRIPT=$(find ~/.claude/plugins/cache -name "search" -path "*/literature-search/*/scripts/*" -type f 2>/dev/null | head -1) +``` + +Then use the full path for all commands: +```bash +$LITERATURE_SCRIPT "CRISPR gene editing advances" 15 +``` + +## API Key Setup Flow + +When you run a search and receive `"setup_required": true`, follow this flow: + +1. **Ask the user for their API key:** + "To search scientific literature, I need your Valyu API key. Get one free ($10 credits) at https://platform.valyu.ai" + +2. **Once the user provides the key, run:** + ```bash + scripts/search setup YOUR_API_KEY + ``` + +3. **Retry the original search.** + +## When to Use This Skill + +- Comprehensive literature reviews across all domains +- Finding all relevant research on a topic +- Cross-domain scientific discovery +- Combining biomedical, physics, and preprint literature +- Emerging research across disciplines +## Output Format + +```json +{ + "success": true, + "type": "literature_search", + "query": "CRISPR gene editing advances", + "result_count": 15, + "results": [ + { + "title": "Article Title", + "url": "https://...", + "content": "Full article text with figures...", + "source": "pubmed|arxiv|biorxiv|medrxiv", + "relevance_score": 0.95, + "images": ["https://example.com/figure1.jpg"] + } + ], + "cost": 0.025 +} +``` + +## Processing Results + +### With jq + +```bash +# Get article titles +scripts/search "query" 20 | jq -r '.results[].title' + +# Get URLs +scripts/search "query" 20 | jq -r '.results[].url' + +# Extract full content +scripts/search "query" 20 | jq -r '.results[].content' + +# Filter by source +scripts/search "query" 20 | jq -r '.results[] | select(.source == "arxiv") | .title' +``` + +## Common Use Cases + +### Comprehensive Literature Review + +```bash +# Search across all sources for thorough review +scripts/search "mechanisms of cellular senescence" 100 +``` + +### Cross-Disciplinary 
Research + +```bash +# Find papers spanning multiple fields +scripts/search "quantum computing applications in drug discovery" 50 +``` + +### Recent Developments + +```bash +# Get latest preprints and publications +scripts/search "foundation models for protein folding" 30 +``` + +### Medical Research + +```bash +# Search biomedical literature comprehensively +scripts/search "immunotherapy checkpoint inhibitors resistance" 40 +``` + + +## Error Handling + +All commands return JSON with `success` field: + +```json +{ + "success": false, + "error": "Error message" +} +``` + +Exit codes: +- `0` - Success +- `1` - Error (check JSON for details) + +## API Endpoint + +- Base URL: `https://api.valyu.ai/v1` +- Endpoint: `/search` +- Authentication: X-API-Key header + +## Architecture + +``` +scripts/ +├── search # Bash wrapper +└── search.mjs # Node.js CLI +``` + +Direct API calls using Node.js built-in `fetch()`, zero external dependencies. + +## Adding to Your Project + +If you're building an AI project and want to integrate Literature Search directly into your application, use the Valyu SDK: + +### Python Integration + +```python +from valyu import Valyu + +client = Valyu(api_key="your-api-key") + +response = client.search( + query="your search query here", + included_sources=["valyu/valyu-pubmed", "valyu/valyu-arxiv", "valyu/valyu-biorxiv", "valyu/valyu-medrxiv"], + max_results=20 +) + +for result in response["results"]: + print(f"Title: {result['title']}") + print(f"URL: {result['url']}") + print(f"Content: {result['content'][:500]}...") +``` + +### TypeScript Integration + +```typescript +import { Valyu } from "valyu-js"; + +const client = new Valyu("your-api-key"); + +const response = await client.search({ + query: "your search query here", + includedSources: ["valyu/valyu-pubmed", "valyu/valyu-arxiv", "valyu/valyu-biorxiv", "valyu/valyu-medrxiv"], + maxResults: 20 +}); + +response.results.forEach((result) => { + console.log(`Title: ${result.title}`); + 
console.log(`URL: ${result.url}`); + console.log(`Content: ${result.content.substring(0, 500)}...`); +}); +``` + +See the [Valyu docs](https://docs.valyu.ai) for full integration examples and SDK reference. diff --git a/container/skills/pubmed-search/scripts/analyze b/container/skills/pubmed-search/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/pubmed-search/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/pubmed-search/scripts/process b/container/skills/pubmed-search/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/pubmed-search/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/pubmed-search/scripts/run b/container/skills/pubmed-search/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/pubmed-search/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/pubmed-search/scripts/search b/container/skills/pubmed-search/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/pubmed-search/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/pubmed-search/skill.md b/container/skills/pubmed-search/skill.md new file mode 100644 index 0000000..6cf8ef3 --- /dev/null +++ b/container/skills/pubmed-search/skill.md @@ -0,0 +1,103 @@ +--- +name: pubmed-search +description: Search PubMed for scientific literature. Use when the user asks to find papers, search literature, look up research, find publications, or asks about recent studies. Triggers on "pubmed", "papers", "literature", "publications", "research on", "studies about". +--- + +# PubMed Search + +Search NCBI PubMed for scientific literature using BioPython's Entrez module. 
+ +## When to Use + +- User asks to find papers on a topic +- User wants recent publications in a field +- User asks for references or citations +- User wants to know the state of research on a topic + +## How to Execute + +### 1. Set up Entrez + +```python +from Bio import Entrez +Entrez.email = "medclaw@freedomai.com" +``` + +### 2. Search PubMed + +```python +# Search +handle = Entrez.esearch(db="pubmed", term="CRISPR delivery methods", retmax=20, sort="date") +record = Entrez.read(handle) +handle.close() + +id_list = record["IdList"] +print(f"Found {record['Count']} results, showing top {len(id_list)}") +``` + +### 3. Fetch article details + +```python +# Fetch details +handle = Entrez.efetch(db="pubmed", id=id_list, rettype="xml") +records = Entrez.read(handle) +handle.close() + +for article in records['PubmedArticle']: + medline = article['MedlineCitation'] + pmid = str(medline['PMID']) + title = medline['Article']['ArticleTitle'] + + # Get authors + authors = medline['Article'].get('AuthorList', []) + first_author = f"{authors[0].get('LastName', '')} {authors[0].get('Initials', '')}" if authors else "Unknown" + + # Get journal and year + journal = medline['Article']['Journal']['Title'] + pub_date = medline['Article']['Journal']['JournalIssue'].get('PubDate', {}) + year = pub_date.get('Year', 'N/A') + + # Get abstract + abstract_parts = medline['Article'].get('Abstract', {}).get('AbstractText', []) + abstract = ' '.join(str(a) for a in abstract_parts)[:300] + + print(f"PMID: {pmid}") + print(f"Title: {title}") + print(f"Authors: {first_author} et al.") + print(f"Journal: {journal} ({year})") + print(f"Abstract: {abstract}...") + print(f"Link: https://pubmed.ncbi.nlm.nih.gov/{pmid}/") + print() +``` + +### 4. Output format for WhatsApp + +``` +*PubMed Search: "CRISPR delivery methods"* +_Found 1,234 results. Top 5:_ + +*1.* Lipid nanoparticle-mediated CRISPR delivery... + _Smith J et al. 
— Nature (2026)_ + PMID: 12345678 + pubmed.ncbi.nlm.nih.gov/12345678 + +*2.* AAV-based CRISPR therapeutics: advances and challenges + _Chen L et al. — Cell (2026)_ + PMID: 12345679 + pubmed.ncbi.nlm.nih.gov/12345679 +``` + +### 5. Advanced searches + +Support these query patterns: +- `"CRISPR"[Title] AND "delivery"[Title]` — title-specific +- `"2026"[Date - Publication]` — date filter +- `"Nature"[Journal]` — journal filter +- `review[Publication Type]` — type filter + +### 6. Follow-up suggestions + +After showing results, suggest: +- "Want me to summarize any of these papers?" +- "Should I search with different keywords?" +- "Want me to find related papers to any of these?" diff --git a/container/skills/scrna-qc/scripts/analyze b/container/skills/scrna-qc/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/scrna-qc/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/scrna-qc/scripts/process b/container/skills/scrna-qc/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/scrna-qc/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/scrna-qc/scripts/run b/container/skills/scrna-qc/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/scrna-qc/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/scrna-qc/scripts/search b/container/skills/scrna-qc/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/scrna-qc/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/scrna-qc/skill.md b/container/skills/scrna-qc/skill.md new file mode 100644 index 0000000..b086498 --- /dev/null +++ b/container/skills/scrna-qc/skill.md @@ -0,0 +1,43 @@ + + +--- +name: scrna-qc +description: Execute the MAD-based single-cell 
RNA-seq QC workflow (scripts + Python API) to filter low-quality cells and emit reports plus filtered AnnData files. +measurable_outcome: Produce filtered .h5ad files, before/after plots, and qc_summary.json within 20 minutes per dataset. +allowed-tools: + - read_file + - run_shell_command +--- + +## At-a-Glance +- **description (10-20 chars):** QC autopilot +- **keywords:** scRNAseq, MAD, h5ad, QC, plots + +## Workflow +1. Accept `.h5ad`, 10x `.h5`, or 10x directory inputs; set mitochondrial/ribosomal patterns as needed. +2. Run `qc_analysis.py` (CLI) or call `qc_core` helpers to compute metrics, apply MAD thresholds, and filter cells/genes. +3. Generate standard plots (metrics before/after, threshold overlays) plus filtered data artifacts. +4. Document parameters (mad_counts/genes/mt, mt_threshold, min_cells, log1p flag) inside the summary JSON. +5. Provide guidance on next steps (doublet detection, downstream analysis). + +## Guardrails +- Adjust MT% expectations for tissue context; avoid over-filtering rare populations. +- This workflow is QC only—doublet handling and batch correction stay separate. +- Keep reproducibility by storing command invocations and environment info. + +## References +- See `README.md`, `qc_core.py`, `qc_analysis.py`, and `qc_plotting.py` for API usage and schema details. 
+ + + \ No newline at end of file diff --git a/container/skills/visium-analysis/scripts/analyze b/container/skills/visium-analysis/scripts/analyze new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/visium-analysis/scripts/analyze @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/visium-analysis/scripts/process b/container/skills/visium-analysis/scripts/process new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/visium-analysis/scripts/process @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/visium-analysis/scripts/run b/container/skills/visium-analysis/scripts/run new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/visium-analysis/scripts/run @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/visium-analysis/scripts/search b/container/skills/visium-analysis/scripts/search new file mode 100755 index 0000000..1becba2 --- /dev/null +++ b/container/skills/visium-analysis/scripts/search @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/container/skills/visium-analysis/skill.md b/container/skills/visium-analysis/skill.md new file mode 100644 index 0000000..1becba2 --- /dev/null +++ b/container/skills/visium-analysis/skill.md @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index dae9e01..d9cf76a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,15 +8,19 @@ "name": "bioclaw", "version": "0.1.0", "dependencies": { + "@larksuiteoapi/node-sdk": "^1.59.0", + "@types/ws": "^8.18.1", "@wecom/aibot-node-sdk": "^1.0.1", "@whiskeysockets/baileys": "^7.0.0-rc.9", "better-sqlite3": "^11.8.1", "cron-parser": "^5.5.0", "discord.js": "^14.25.1", - "dotenv": "^17.3.1", + "grammy": "^1.41.1", + "pdf-parse": "^2.4.5", "pino": "^9.6.0", "pino-pretty": "^13.0.0", "qrcode-terminal": "^0.12.0", + "ws": 
"^8.19.0", "zod": "^4.3.6" }, "devDependencies": { @@ -270,9 +274,9 @@ } }, "node_modules/@emnapi/runtime": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.8.1.tgz", - "integrity": "sha512-mehfKSMWjjNol8659Z8KxEMrdSJDDot5SXMq00dM8BN4o+CLNXQ0xH2V7EchNHV4RmbZLmmPdEaXZc5H2FXmDg==", + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.9.0.tgz", + "integrity": "sha512-QN75eB0IH2ywSpRpNddCRfQIhmJYBCJ1x5Lb3IscKAL8bMnVAKnRg8dCoXbHzVLLH7P38N2Z3mtulB7W0J0FKw==", "license": "MIT", "optional": true, "peer": true, @@ -722,6 +726,12 @@ "node": ">=18" } }, + "node_modules/@grammyjs/types": { + "version": "3.25.0", + "resolved": "https://registry.npmjs.org/@grammyjs/types/-/types-3.25.0.tgz", + "integrity": "sha512-iN9i5p+8ZOu9OMxWNcguojQfz4K/PDyMPOnL7PPCON+SoA/F8OKMH3uR7CVUkYfdNe0GCz8QOzAWrnqusQYFOg==", + "license": "MIT" + }, "node_modules/@hapi/boom": { "version": "9.1.4", "resolved": "https://registry.npmjs.org/@hapi/boom/-/boom-9.1.4.tgz", @@ -1277,6 +1287,205 @@ "integrity": "sha512-dXn3FZhPv0US+7dtJsIi2R+c7qWYiReoEh5zUntWCf4oSpMNib8FDhSoed6m3QyZdx5hK7iLFkYk3rNxwt8vTA==", "license": "MIT" }, + "node_modules/@larksuiteoapi/node-sdk": { + "version": "1.59.0", + "resolved": "https://registry.npmjs.org/@larksuiteoapi/node-sdk/-/node-sdk-1.59.0.tgz", + "integrity": "sha512-sBpkruTvZDOxnVtoTbepWKRX0j1Y1ZElQYu0x7+v088sI9pcpbVp6ZzCGn62dhrKPatzNyCJyzYCPXPYQWccrA==", + "license": "MIT", + "dependencies": { + "axios": "~1.13.3", + "lodash.identity": "^3.0.0", + "lodash.merge": "^4.6.2", + "lodash.pickby": "^4.6.0", + "protobufjs": "^7.2.6", + "qs": "^6.14.2", + "ws": "^8.19.0" + } + }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz", + "integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==", + "license": "MIT", + "workspaces": [ + "e2e/*" + ], + "engines": { + 
"node": ">= 10" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.80", + "@napi-rs/canvas-darwin-arm64": "0.1.80", + "@napi-rs/canvas-darwin-x64": "0.1.80", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.80", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.80", + "@napi-rs/canvas-linux-arm64-musl": "0.1.80", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.80", + "@napi-rs/canvas-linux-x64-gnu": "0.1.80", + "@napi-rs/canvas-linux-x64-musl": "0.1.80", + "@napi-rs/canvas-win32-x64-msvc": "0.1.80" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.80.tgz", + "integrity": "sha512-sk7xhN/MoXeuExlggf91pNziBxLPVUqF2CAVnB57KLG/pz7+U5TKG8eXdc3pm0d7Od0WreB6ZKLj37sX9muGOQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.80.tgz", + "integrity": "sha512-O64APRTXRUiAz0P8gErkfEr3lipLJgM6pjATwavZ22ebhjYl/SUbpgM0xcWPQBNMP1n29afAC/Us5PX1vg+JNQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.80.tgz", + "integrity": "sha512-FqqSU7qFce0Cp3pwnTjVkKjjOtxMqRe6lmINxpIZYaZNnVI0H5FtsaraZJ36SiTHNjZlUB69/HhxNDT1Aaa9vA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.80.tgz", + 
"integrity": "sha512-eyWz0ddBDQc7/JbAtY4OtZ5SpK8tR4JsCYEZjCE3dI8pqoWUC8oMwYSBGCYfsx2w47cQgQCgMVRVTFiiO38hHQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.80.tgz", + "integrity": "sha512-qwA63t8A86bnxhuA/GwOkK3jvb+XTQaTiVML0vAWoHyoZYTjNs7BzoOONDgTnNtr8/yHrq64XXzUoLqDzU+Uuw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.80.tgz", + "integrity": "sha512-1XbCOz/ymhj24lFaIXtWnwv/6eFHXDrjP0jYkc6iHQ9q8oXKzUX1Lc6bu+wuGiLhGh2GS/2JlfORC5ZcXimRcg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.80.tgz", + "integrity": "sha512-XTzR125w5ZMs0lJcxRlS1K3P5RaZ9RmUsPtd1uGt+EfDyYMu4c6SEROYsxyatbbu/2+lPe7MPHOO/0a0x7L/gw==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.80.tgz", + "integrity": "sha512-BeXAmhKg1kX3UCrJsYbdQd3hIMDH/K6HnP/pG2LuITaXhXBiNdh//TVVVVCBbJzVQaV5gK/4ZOCMrQW9mvuTqA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + 
"node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.80.tgz", + "integrity": "sha512-x0XvZWdHbkgdgucJsRxprX/4o4sEed7qo9rCQA9ugiS9qE2QvP0RIiEugtZhfLH3cyI+jIRFJHV4Fuz+1BHHMg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.80", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.80.tgz", + "integrity": "sha512-Z8jPsM6df5V8B1HrCHB05+bDiCxjE9QA//3YrkKIdVDEwn5RKaqOxCJDRJkl48cJbylcrJbW4HxZbTte8juuPg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, "node_modules/@pinojs/redact": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/@pinojs/redact/-/redact-0.4.0.tgz", @@ -1979,9 +2188,9 @@ } }, "node_modules/@wecom/aibot-node-sdk": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@wecom/aibot-node-sdk/-/aibot-node-sdk-1.0.1.tgz", - "integrity": "sha512-c/sa1IvRKIP+4rZfRV2v70FaXB92+BJIh+vedZkPa8wZ1dwIUyvGg7ydkfYRIwFDzjO9IJZUX5V14EUQYVopAg==", + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@wecom/aibot-node-sdk/-/aibot-node-sdk-1.0.2.tgz", + "integrity": "sha512-azClUIMWWF5vs8K1YWBiNykTFUawej0Z1ooN0ZMGX/PlLB/BK0dQfwbLc1a5Wj3bLRLaFb8HuCTuBrxLnJKJ7g==", "license": "MIT", "dependencies": { "axios": "^1.6.7", @@ -2028,6 +2237,18 @@ } } }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "license": "MIT", + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, 
"node_modules/assertion-error": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", @@ -2186,6 +2407,22 @@ "node": ">= 0.4" } }, + "node_modules/call-bound": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz", + "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "get-intrinsic": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/chai": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/chai/-/chai-6.2.2.tgz", @@ -2351,18 +2588,6 @@ "url": "https://github.com/discordjs/discord.js?sponsor" } }, - "node_modules/dotenv": { - "version": "17.3.1", - "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-17.3.1.tgz", - "integrity": "sha512-IO8C/dzEb6O3F9/twg6ZLXz164a2fhTnEWb95H23Dm4OuN+92NmEAlTrupP9VW6Jm3sO26tQlqyvyi4CsnY9GA==", - "license": "BSD-2-Clause", - "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://dotenvx.com" - } - }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -2490,6 +2715,15 @@ "@types/estree": "^1.0.0" } }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/eventemitter3": { "version": "5.0.4", "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.4.tgz", @@ -2709,6 +2943,21 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/grammy": { + "version": "1.41.1", + "resolved": 
"https://registry.npmjs.org/grammy/-/grammy-1.41.1.tgz", + "integrity": "sha512-wcHAQ1e7svL3fJMpDchcQVcWUmywhuepOOjHUHmMmWAwUJEIyK5ea5sbSjZd+Gy1aMpZeP8VYJa+4tP+j1YptQ==", + "license": "MIT", + "dependencies": { + "@grammyjs/types": "3.25.0", + "abort-controller": "^3.0.0", + "debug": "^4.4.3", + "node-fetch": "^2.7.0" + }, + "engines": { + "node": "^12.20.0 || >=14.13.1" + } + }, "node_modules/has-flag": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", @@ -2939,6 +3188,24 @@ "integrity": "sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==", "license": "MIT" }, + "node_modules/lodash.identity": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/lodash.identity/-/lodash.identity-3.0.0.tgz", + "integrity": "sha512-AupTIzdLQxJS5wIYUQlgGyk2XRTfGXA+MCghDHqZk0pzUNYvd3EESS6dkChNauNYVIutcb0dfHw1ri9Q1yPV8Q==", + "license": "MIT" + }, + "node_modules/lodash.merge": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", + "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", + "license": "MIT" + }, + "node_modules/lodash.pickby": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/lodash.pickby/-/lodash.pickby-4.6.0.tgz", + "integrity": "sha512-AZV+GsS/6ckvPOVQPXSiFFacKvKB4kOQu6ynt9wz0F3LO4R9Ij4K1ddYsIytDpSgLz88JHd9P+oaLeej5/Sl7Q==", + "license": "MIT" + }, "node_modules/lodash.snakecase": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/lodash.snakecase/-/lodash.snakecase-4.1.1.tgz", @@ -3153,6 +3420,38 @@ "node": ">=10" } }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + 
"engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, + "node_modules/object-inspect": { + "version": "1.13.4", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.4.tgz", + "integrity": "sha512-W67iLl4J2EXEGTbfeHCffrjDfitvLANg0UlX3wFUUSTx92KXRFegMHUVgSqE+wvhAbi4WqjGg9czysTV2Epbew==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/obug": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/obug/-/obug-2.1.1.tgz", @@ -3217,6 +3516,38 @@ "dev": true, "license": "MIT" }, + "node_modules/pdf-parse": { + "version": "2.4.5", + "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.4.5.tgz", + "integrity": "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==", + "license": "Apache-2.0", + "dependencies": { + "@napi-rs/canvas": "0.1.80", + "pdfjs-dist": "5.4.296" + }, + "bin": { + "pdf-parse": "bin/cli.mjs" + }, + "engines": { + "node": ">=20.16.0 <21 || >=22.3.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/mehmet-kozan" + } + }, + "node_modules/pdfjs-dist": { + "version": "5.4.296", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz", + "integrity": "sha512-DlOzet0HO7OEnmUmB6wWGJrrdvbyJKftI1bhMitK7O2N8W2gc757yyYBbINy9IDafXAV9wmKr9t7xsTaNKRG5Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=20.16.0 || >=22.3.0" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.80" + } + }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", @@ -3454,6 +3785,21 @@ "qrcode-terminal": "bin/qrcode-terminal.js" } }, + "node_modules/qs": { + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.15.0.tgz", + "integrity": 
"sha512-mAZTtNCeetKMH+pSjrb76NAM8V9a05I9aBZOHztWy/UqcJdQYNsf59vrRKWnojAT9Y+GbIvoTBC++CPHqpDBhQ==", + "license": "BSD-3-Clause", + "dependencies": { + "side-channel": "^1.1.0" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/quick-format-unescaped": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/quick-format-unescaped/-/quick-format-unescaped-4.0.4.tgz", @@ -3664,6 +4010,78 @@ "@img/sharp-win32-x64": "0.34.5" } }, + "node_modules/side-channel": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.1.0.tgz", + "integrity": "sha512-ZX99e6tRweoUXqR+VBrslhda51Nh5MTQwou5tnUDgbtyM0dBgmhEDtWGP/xbKn6hqfPRHujUNwz5fy/wbbhnpw==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3", + "side-channel-list": "^1.0.0", + "side-channel-map": "^1.0.1", + "side-channel-weakmap": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-list": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz", + "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-map": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/side-channel-map/-/side-channel-map-1.0.1.tgz", + "integrity": "sha512-VCjCNfgMsby3tTdo02nbjtM/ewra6jPHmpThenkTYh8pG9ucZ/1P8So4u4FGBek/BjpOVsDCMoLA/iuBKIFXRA==", + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3" + }, + "engines": { + "node": ">= 0.4" + 
}, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel-weakmap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/side-channel-weakmap/-/side-channel-weakmap-1.0.2.tgz", + "integrity": "sha512-WPS/HvHQTYnHisLo9McqBHOJk2FkHO/tlpvldyrnem4aeQp4hai3gythswg6p01oSoTl58rcpiFAjF2br2Ak2A==", + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.5", + "object-inspect": "^1.13.3", + "side-channel-map": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/siginfo": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/siginfo/-/siginfo-2.0.0.tgz", @@ -3907,6 +4325,12 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, "node_modules/ts-mixer": { "version": "6.0.4", "resolved": "https://registry.npmjs.org/ts-mixer/-/ts-mixer-6.0.4.tgz", @@ -4151,6 +4575,22 @@ } } }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/why-is-node-running": { "version": "2.3.0", "resolved": 
/*
Patch context that shares this collapsed line with the code below. It is not
TypeScript; it is carried through unchanged (only re-broken into lines) so that
replacing the line does not drop it.

"https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz",
diff --git a/package.json b/package.json
index 5cc4646..b893bd9 100644
--- a/package.json
+++ b/package.json
@@ -7,7 +7,7 @@
   "scripts": {
     "build": "tsc",
     "start": "node dist/index.js",
-    "dev": "tsx src/index.ts",
+    "dev": "tsx --env-file=.env src/index.ts",
     "cli": "tsx src/cli.ts",
     "auth": "tsx src/whatsapp-auth.ts",
     "typecheck": "tsc --noEmit",
@@ -17,15 +17,19 @@
     "test:watch": "vitest"
   },
   "dependencies": {
+    "@larksuiteoapi/node-sdk": "^1.59.0",
+    "@types/ws": "^8.18.1",
     "@wecom/aibot-node-sdk": "^1.0.1",
     "@whiskeysockets/baileys": "^7.0.0-rc.9",
     "better-sqlite3": "^11.8.1",
     "cron-parser": "^5.5.0",
     "discord.js": "^14.25.1",
-    "dotenv": "^17.3.1",
+    "grammy": "^1.41.1",
+    "pdf-parse": "^2.4.5",
     "pino": "^9.6.0",
     "pino-pretty": "^13.0.0",
     "qrcode-terminal": "^0.12.0",
+    "ws": "^8.19.0",
     "zod": "^4.3.6"
   },
   "devDependencies": {
diff --git a/src/channels/feishu.ts b/src/channels/feishu.ts
new file mode 100644
index 0000000..5d5f21e
--- /dev/null
+++ b/src/channels/feishu.ts
@@ -0,0 +1,516 @@
*/
import fs from 'fs';
import https from 'https';
import path from 'path';
import * as lark from '@larksuiteoapi/node-sdk';

import { ASSISTANT_NAME, GROUPS_DIR, TRIGGER_PATTERN } from '../config.js';
import { logger } from '../logger.js';
import {
  Channel,
  OnInboundMessage,
  OnChatMetadata,
  RegisteredGroup,
} from '../types.js';

// Feishu post message max content size (30 KB but we stay conservative)
const POST_MAX_CHARS = 20_000;

// --- Text chunking (same pattern as WeCom) ---

/**
 * Split `text` into pieces of at most POST_MAX_CHARS characters.
 *
 * Strategy, in order of preference:
 *  1. Text that already fits is returned as a single-element array, untouched.
 *  2. Otherwise the text is cut into sections at markdown headings and
 *     consecutive sections are packed together (joined by a blank line) while
 *     they fit.
 *  3. A section that is itself too long is split at the last paragraph break,
 *     else the last line break, else the last sentence end that lies past 30 %
 *     of the limit; if none qualifies it is hard-cut at the limit.
 *
 * @param text Markdown/plain text to send.
 * @returns Non-empty, trimmed chunks in original order.
 */
function chunkText(text: string): string[] {
  if (text.length <= POST_MAX_CHARS) return [text];

  // A cut point earlier than this would produce a needlessly small chunk.
  const minCut = POST_MAX_CHARS * 0.3;

  // Pass 1: start a new section at every markdown heading.
  const sections: string[] = [];
  let current = '';
  for (const line of text.split('\n')) {
    if (/^#{1,6}\s/.test(line) && current.trim()) {
      sections.push(current.trimEnd());
      current = line + '\n';
    } else {
      current += line + '\n';
    }
  }
  if (current.trim()) sections.push(current.trimEnd());

  // Index of the LAST sentence terminator followed by whitespace, or -1.
  // (The original used String.search, which reports the FIRST match, so the
  // `> minCut` test below practically never passed and long single-line text
  // was always hard-cut mid-sentence.)
  const lastSentenceEnd = (head: string): number => {
    let idx = -1;
    // Full-width and ASCII terminators.
    for (const m of head.matchAll(/[。!?!?.]\s/g)) idx = m.index ?? idx;
    return idx;
  };

  // Split one oversized section into pieces that each fit the limit.
  const splitBlock = (block: string): string[] => {
    const result: string[] = [];
    let rem = block;
    while (rem.length > POST_MAX_CHARS) {
      let at = POST_MAX_CHARS;
      const paraIdx = rem.lastIndexOf('\n\n', POST_MAX_CHARS);
      if (paraIdx > minCut) {
        at = paraIdx + 2;
      } else {
        const lineIdx = rem.lastIndexOf('\n', POST_MAX_CHARS);
        if (lineIdx > minCut) {
          at = lineIdx + 1;
        } else {
          const sentIdx = lastSentenceEnd(rem.slice(0, POST_MAX_CHARS));
          if (sentIdx > minCut) at = sentIdx + 2;
        }
      }
      // The separator that follows the cut is whitespace and is trimmed away,
      // so a piece never exceeds the limit. Skip pieces that trim to nothing.
      const piece = rem.slice(0, at).trim();
      if (piece) result.push(piece);
      rem = rem.slice(at).trim();
    }
    if (rem) result.push(rem);
    return result;
  };

  // Pass 2: greedily pack sections into chunks.
  const chunks: string[] = [];
  let buf = '';
  for (const section of sections) {
    const sep = buf ? '\n\n' : '';
    if ((buf + sep + section).length <= POST_MAX_CHARS) {
      buf += sep + section;
      continue;
    }
    if (buf) {
      chunks.push(buf.trim());
      buf = '';
    }
    if (section.length <= POST_MAX_CHARS) {
      buf = section;
    } else {
      // Emit all but the last piece; the last one may still absorb what follows.
      const parts = splitBlock(section);
      for (let i = 0; i < parts.length - 1; i++) chunks.push(parts[i]);
      buf = parts[parts.length - 1] ?? '';
    }
  }
  if (buf.trim()) chunks.push(buf.trim());
  return chunks;
}

// --- Markdown → Feishu post content ---
// Converts a markdown text block into Feishu post content elements.
// Each line becomes one or more elements in a paragraph (array of arrays).
/** One inline element of a Feishu post paragraph: plain text or a hyperlink. */
type PostElement = { tag: string; text?: string; href?: string };

// Inline markdown link: [label](http(s)://target)
const MARKDOWN_LINK = /\[([^\]]+)\]\((https?:\/\/[^)]+)\)/g;

/**
 * Convert a markdown text block into Feishu post content.
 *
 * Every input line becomes one paragraph (an array of elements):
 *  - a leading ATX heading marker (`#`..`######` followed by whitespace, or a
 *    line consisting only of the marker) is removed, the heading text is kept;
 *  - an empty line becomes a paragraph holding one empty text element, which
 *    serves as visual spacing;
 *  - `[label](http…)` becomes an `a` element, everything else `text` elements.
 *
 * The marker must be followed by whitespace (same rule `chunkText` uses to
 * detect headings), so lines such as `#hashtag` keep their `#`.
 *
 * @param text Markdown text; lines are separated by `\n`.
 * @returns Paragraphs in input order, one per line.
 */
function markdownToPostContent(text: string): PostElement[][] {
  const paragraphs: PostElement[][] = [];

  for (const line of text.split('\n')) {
    const stripped = line.replace(/^#{1,6}(?:\s+|$)/, '');

    if (!stripped) {
      paragraphs.push([{ tag: 'text', text: '' }]);
      continue;
    }

    const elements: PostElement[] = [];
    let cursor = 0;
    // matchAll works on a private copy of the regex, so the shared constant's
    // lastIndex is never left in a stale state.
    for (const m of stripped.matchAll(MARKDOWN_LINK)) {
      const start = m.index ?? 0;
      if (start > cursor) {
        elements.push({ tag: 'text', text: stripped.slice(cursor, start) });
      }
      elements.push({ tag: 'a', text: m[1], href: m[2] });
      cursor = start + m[0].length;
    }
    if (cursor < stripped.length) {
      elements.push({ tag: 'text', text: stripped.slice(cursor) });
    }

    paragraphs.push(elements);
  }

  return paragraphs;
}

// Lower-cased file extension → Feishu upload `file_type`.
// A Map (not an object literal) so that extensions such as "constructor" or
// "__proto__" cannot resolve to inherited Object.prototype members.
const FEISHU_FILE_TYPES: ReadonlyMap<string, string> = new Map([
  ['mp4', 'mp4'],
  ['opus', 'opus'],
  ['pdf', 'pdf'],
  ['doc', 'doc'],
  ['docx', 'doc'],
  ['xls', 'xls'],
  ['xlsx', 'xls'],
  ['ppt', 'ppt'],
  ['pptx', 'ppt'],
]);

/**
 * Determine file_type for Feishu upload from file extension.
 *
 * @param filePath Path or bare file name; only its extension is inspected,
 *                 case-insensitively.
 * @returns The mapped Feishu file type, or `'stream'` for unknown or missing
 *          extensions.
 */
function getFeishuFileType(filePath: string): string {
  const ext = path.extname(filePath).toLowerCase().slice(1);
  return FEISHU_FILE_TYPES.get(ext) ?? 'stream';
}
/**
 * Download a Feishu message resource (image or file) using native https.
 *
 * @param token       Tenant access token, sent as a Bearer credential.
 * @param messageId   ID of the message that carries the resource.
 * @param resourceKey `image_key` / `file_key` taken from the message content.
 * @param type        Resource kind, passed as the `type` query parameter.
 * @returns The raw response body. The promise rejects with
 *          `HTTP <status>: <first 200 chars of the body>` on a non-200
 *          response, or with the underlying error when the request or the
 *          response stream fails.
 */
function downloadMessageResource(
  token: string,
  messageId: string,
  resourceKey: string,
  type: 'image' | 'file',
): Promise<Buffer> {
  return new Promise((resolve, reject) => {
    // Both ids originate from inbound event payloads; encode them so they can
    // never change the shape of the request path.
    const url =
      `https://open.feishu.cn/open-apis/im/v1/messages/${encodeURIComponent(messageId)}` +
      `/resources/${encodeURIComponent(resourceKey)}?type=${type}`;
    https
      .get(url, { headers: { Authorization: `Bearer ${token}` } }, (res) => {
        const chunks: Buffer[] = [];
        res.on('data', (d: Buffer) => chunks.push(d));
        res.on('error', reject);
        res.on('end', () => {
          const body = Buffer.concat(chunks);
          if (res.statusCode !== 200) {
            reject(new Error(`HTTP ${res.statusCode}: ${body.toString().slice(0, 200)}`));
            return;
          }
          resolve(body);
        });
      })
      .on('error', reject);
  });
}

/**
 * Reduce a sender-supplied attachment name to a bare file name.
 *
 * `file_name` comes from the message payload, so it must never be able to
 * steer the write outside the group directory (`../../x`, absolute paths,
 * Windows separators). Falls back to `fallback` when nothing usable is left.
 */
function safeAttachmentName(requested: string | undefined, fallback: string): string {
  const base = path.basename((requested ?? '').replace(/\\/g, '/')).trim();
  return base === '' || base === '.' || base === '..' ? fallback : base;
}

/** Construction options for {@link FeishuChannel}. */
export interface FeishuChannelOpts {
  /** Receives every accepted inbound message. */
  onMessage: OnInboundMessage;
  /** Called with the chat JID and a timestamp before each message is delivered. */
  onChatMetadata: OnChatMetadata;
  /** Returns the currently registered groups, keyed by JID. */
  registeredGroups: () => Record<string, RegisteredGroup>;
  jidPrefix?: string; // default: 'fs:'
  /** Auto-register new chats to this folder (skips manual dashboard registration) */
  defaultFolder?: string;
  /** Agent type for auto-registered groups */
  defaultAgentType?: 'claude' | 'minimax' | 'qwen';
  /** Called when a new group is auto-registered */
  onRegisterGroup?: (jid: string, group: RegisteredGroup) => void;
}

/**
 * Feishu / Lark channel: receives `im.message.receive_v1` events over the
 * SDK's WebSocket long connection and sends text (as rich-text posts), images
 * and files through the REST client.
 */
export class FeishuChannel implements Channel {
  name = 'feishu';
  prefixAssistantName = false;

  private appId: string;
  private appSecret: string;
  private opts: FeishuChannelOpts;
  private jidPrefix: string;
  private client: lark.Client;
  private wsClient: lark.WSClient;
  private connected = false;
  private stopped = false;
  // per-JID send queue to prevent chunk interleaving
  private sendQueues: Map<string, Promise<void>> = new Map();
  // dedup: track processed message IDs to avoid double-processing
  private processedMsgIds = new Set<string>();

  constructor(appId: string, appSecret: string, opts: FeishuChannelOpts) {
    this.appId = appId;
    this.appSecret = appSecret;
    this.opts = opts;
    this.jidPrefix = opts.jidPrefix ?? 'fs:';

    this.client = new lark.Client({
      appId,
      appSecret,
      loggerLevel: lark.LoggerLevel.warn,
    });

    this.wsClient = new lark.WSClient({
      appId,
      appSecret,
      loggerLevel: lark.LoggerLevel.warn,
    });
  }

  /** Register the message handler and start the WebSocket long connection. */
  async connect(): Promise<void> {
    this.stopped = false;

    const dispatcher = new lark.EventDispatcher({}).register({
      'im.message.receive_v1': async (data) => {
        logger.debug({ chatId: data.message?.chat_id, chatType: data.message?.chat_type, msgType: data.message?.message_type, senderType: data.sender?.sender_type }, 'Feishu raw event received');
        await this._handleMessage(data);
      },
    });

    // NOTE(review): the original comment says WSClient.start() only kicks the
    // connection off asynchronously; if so, `connected` below means "started",
    // not "socket open" — confirm against the SDK.
    await this.wsClient.start({ eventDispatcher: dispatcher });

    this.connected = true;
    logger.info({ appId: this.appId }, 'Feishu WebSocket long-connection started');
  }

  /** True between a successful connect() and the next disconnect(). */
  isConnected(): boolean {
    return this.connected && !this.stopped;
  }

  /** Mark the channel stopped and force-close the WebSocket (best effort). */
  async disconnect(): Promise<void> {
    this.stopped = true;
    this.connected = false;
    try {
      this.wsClient.close({ force: true });
    } catch (_) { /* ignore */ }
    logger.info('Feishu channel disconnected');
  }

  /** A JID belongs to this channel when it starts with the configured prefix. */
  ownsJid(jid: string): boolean {
    return jid.startsWith(this.jidPrefix);
  }

  // --- Inbound message handling ---

  /**
   * Handle one `im.message.receive_v1` event.
   *
   * Drops messages sent by apps, duplicates, and message types other than
   * text / image / file / audio. Attachments are downloaded into the chat's
   * group folder (when the chat is already registered) and represented to the
   * orchestrator by a short note. Unknown chats are auto-registered when
   * `defaultFolder` and `onRegisterGroup` are both configured.
   */
  private async _handleMessage(data: {
    sender: { sender_id?: { open_id?: string }; sender_type: string };
    message: {
      message_id: string;
      chat_id: string;
      chat_type: string;
      message_type: string;
      content: string;
      mentions?: Array<{ key: string; id: { open_id?: string }; name: string }>;
    };
  }): Promise<void> {
    const { sender, message } = data;

    // Skip bot's own messages
    if (sender.sender_type === 'app') return;

    // Dedup
    if (this.processedMsgIds.has(message.message_id)) return;
    this.processedMsgIds.add(message.message_id);
    // Keep set bounded: a Set iterates in insertion order, so this evicts the oldest id
    if (this.processedMsgIds.size > 500) {
      const first = this.processedMsgIds.values().next().value;
      if (first !== undefined) this.processedMsgIds.delete(first);
    }

    const msgType = message.message_type;
    if (!['text', 'image', 'file', 'audio'].includes(msgType)) return;

    const jid = `${this.jidPrefix}${message.chat_id}`;
    const senderId = sender.sender_id?.open_id ?? 'unknown';
    // Time of receipt, not the message's own creation time.
    const timestamp = new Date().toISOString();

    // Parse content based on message type
    let text = '';
    let attachmentNote = '';

    if (msgType === 'text') {
      try {
        const parsed = JSON.parse(message.content) as { text?: string };
        text = parsed.text ?? '';
      } catch {
        logger.warn({ content: message.content }, 'Feishu: failed to parse text content');
        return;
      }
    } else {
      // image / file / audio — download and save to group folder, then add a note
      try {
        const parsed = JSON.parse(message.content) as Record<string, string | undefined>;
        const groups = this.opts.registeredGroups();
        const group = groups[jid];
        const groupFolder = group?.folder;

        if (groupFolder) {
          const groupDir = path.join(GROUPS_DIR, groupFolder);
          fs.mkdirSync(groupDir, { recursive: true });

          // NOTE(review): tokenManager is reached through an `any` cast, i.e. it
          // is not part of the client's typed surface — may break on SDK upgrades.
          const token = await (this.client as any).tokenManager.getTenantAccessToken();

          if (msgType === 'image') {
            const imageKey = parsed.image_key;
            if (imageKey) {
              const filename = `feishu-img-${message.message_id.slice(-8)}.jpg`;
              const dest = path.join(groupDir, filename);
              try {
                const buf = await downloadMessageResource(token, message.message_id, imageKey, 'image');
                fs.writeFileSync(dest, buf);
                attachmentNote = `[用户发送了图片: ${filename}]`;
                logger.info({ jid, filename }, 'Feishu: image saved');
              } catch (e) {
                logger.warn({ err: e, imageKey }, 'Feishu: failed to download image');
                attachmentNote = '[用户发送了图片,下载失败]';
              }
            }
          } else if (msgType === 'file' || msgType === 'audio') {
            const fileKey = parsed.file_key ?? parsed.audio_key;
            // file_name is sender-controlled: strip any directory part so the
            // write below cannot escape groupDir.
            const fileName = safeAttachmentName(
              parsed.file_name,
              `feishu-${msgType}-${message.message_id.slice(-8)}`,
            );
            if (fileKey) {
              const dest = path.join(groupDir, fileName);
              try {
                const buf = await downloadMessageResource(token, message.message_id, fileKey, 'file');
                fs.writeFileSync(dest, buf);
                attachmentNote = `[用户发送了文件: ${fileName}]`;
                logger.info({ jid, fileName }, 'Feishu: file saved');
              } catch (e) {
                logger.warn({ err: e, fileKey }, 'Feishu: failed to download file');
                attachmentNote = `[用户发送了文件: ${fileName},下载失败]`;
              }
            }
          }
        } else {
          // Group not yet registered — still want to trigger agent after registration
          if (msgType === 'image') attachmentNote = '[用户发送了图片]';
          else attachmentNote = '[用户发送了文件]';
        }
      } catch (e) {
        logger.warn({ err: e }, 'Feishu: failed to handle attachment');
        return;
      }
      text = attachmentNote;
    }

    // Strip @mention tokens from text (works for both group and p2p).
    // Feishu puts placeholders (`@_user_1`, `@_all`) into the text for
    // mentions; only those are removed. The previous pattern `/@[^\s]+/`
    // also deleted e-mail domains and any other "@word" the user typed.
    if (msgType === 'text') {
      text = text.replace(/@_(?:user_\d+|all)\b/g, '').trim();
    }

    if (!text) return;

    logger.info({ jid, senderId, chatType: message.chat_type, msgType, textLen: text.length }, 'Feishu message received');

    // Auto-register unrecognised chats when defaultFolder is configured
    if (this.opts.defaultFolder && this.opts.onRegisterGroup) {
      const known = this.opts.registeredGroups();
      if (!known[jid]) {
        const chatType = message.chat_type as 'p2p' | 'group';
        const autoGroup: RegisteredGroup = {
          name: jid,
          folder: this.opts.defaultFolder,
          trigger: TRIGGER_PATTERN.source,
          added_at: timestamp,
          agentType: this.opts.defaultAgentType,
          // p2p chats don't need a trigger word; group chats respond to all messages
          requiresTrigger: false,
        };
        this.opts.onRegisterGroup(jid, autoGroup);
        logger.info({ jid, folder: this.opts.defaultFolder, chatType }, 'Feishu: auto-registered new chat');
      }
    }

    // Register chat metadata
    this.opts.onChatMetadata(jid, timestamp);

    // Deliver message to orchestrator
    this.opts.onMessage(jid, {
      id: message.message_id,
      chat_jid: jid,
      sender: senderId,
      sender_name: senderId,
      content: text,
      timestamp,
    });
  }

  // --- Outbound: text ---

  /**
   * Send `text` to the chat behind `jid`, split into post-sized chunks.
   * Sends to the same JID are serialized so chunks of different messages
   * cannot interleave.
   */
  async sendMessage(jid: string, text: string): Promise<void> {
    const chatId = jid.slice(this.jidPrefix.length);
    const chunks = chunkText(text);

    // Queue per JID to prevent chunk interleaving
    const prev = this.sendQueues.get(jid) ?? Promise.resolve();
    const next = prev.then(async () => {
      for (const chunk of chunks) {
        await this._sendPost(chatId, chunk);
        if (chunks.length > 1) {
          // brief pause between chunks of a multi-part message
          await new Promise((r) => setTimeout(r, 300));
        }
      }
    });
    this.sendQueues.set(jid, next.catch(() => { /* swallow so queue keeps going */ }));
    return next;
  }

  // Send a single text chunk as Feishu post (rich text) — preserves markdown formatting.
  // Failures are logged, never thrown.
  private async _sendPost(chatId: string, text: string): Promise<void> {
    const content = markdownToPostContent(text);

    try {
      const res = await this.client.im.message.create({
        params: { receive_id_type: 'chat_id' },
        data: {
          receive_id: chatId,
          msg_type: 'post',
          content: JSON.stringify({ zh_cn: { title: '', content } }),
        },
      });
      if (res.code !== 0) {
        logger.error({ chatId, code: res.code, msg: res.msg }, 'Feishu sendMessage failed');
      }
    } catch (err) {
      logger.error({ err, chatId }, 'Feishu sendMessage error');
    }
  }

  // --- Outbound: image ---

  /**
   * Upload the image at `imagePath`, send it to the chat, then send `caption`
   * (if any) as a separate text message. Failures are logged, never thrown.
   */
  async sendImage(jid: string, imagePath: string, caption?: string): Promise<void> {
    const chatId = jid.slice(this.jidPrefix.length);

    try {
      // 1. Upload image
      const imageStream = fs.createReadStream(imagePath);
      const uploadRes = await this.client.im.image.create({
        data: {
          image_type: 'message',
          image: imageStream,
        },
      }) as any;

      // The key is looked for both at the top level and under `data`.
      const imageKey: string | undefined = uploadRes?.image_key ?? uploadRes?.data?.image_key;
      if (!imageKey) {
        logger.error({ uploadRes }, 'Feishu image upload failed — no image_key');
        return;
      }

      // 2. Send image message
      const sendRes = await this.client.im.message.create({
        params: { receive_id_type: 'chat_id' },
        data: {
          receive_id: chatId,
          msg_type: 'image',
          content: JSON.stringify({ image_key: imageKey }),
        },
      });

      const sent = sendRes.code === 0;
      if (!sent) {
        logger.error({ chatId, code: sendRes.code }, 'Feishu sendImage failed');
      }

      // 3. Send caption if provided
      if (caption) {
        await this.sendMessage(jid, caption);
      }

      // Only report success when the image message was actually accepted.
      if (sent) {
        logger.info({ chatId, imageKey }, 'Feishu image sent');
      }
    } catch (err) {
      logger.error({ err, chatId, imagePath }, 'Feishu sendImage error');
    }
  }

  // --- Outbound: file ---
  // NOTE(review): the remainder of sendFile lies beyond the reviewed range; the
  // visible part is reproduced unchanged up to the point where it is cut off.
  async sendFile(jid: string, filePath: string): Promise<void> {
    const chatId = jid.slice(this.jidPrefix.length);
    const fileName = path.basename(filePath);
    const fileType = getFeishuFileType(filePath);

    try {
      // 1. Upload file
      const fileStream = fs.createReadStream(filePath);
      const uploadRes = await this.client.im.file.create({
        data: {
          file_type: fileType as any,
          file_name: fileName,
          file: fileStream,
        },
      }) as any;

      const fileKey: string | undefined = uploadRes?.file_key ?? uploadRes?.data?.file_key;
      if (!fileKey) {
        logger.error({ uploadRes }, 'Feishu file upload failed — no file_key');
        return;
      }

      // 2.
Send file message + const sendRes = await this.client.im.message.create({ + params: { receive_id_type: 'chat_id' }, + data: { + receive_id: chatId, + msg_type: 'file', + content: JSON.stringify({ file_key: fileKey }), + }, + }); + + if (sendRes.code !== 0) { + logger.error({ chatId, code: sendRes.code }, 'Feishu sendFile failed'); + } + + logger.info({ chatId, fileKey, fileName }, 'Feishu file sent'); + } catch (err) { + logger.error({ err, chatId, filePath }, 'Feishu sendFile error'); + } + } + + async setTyping(_jid: string, _isTyping: boolean): Promise { + // Feishu doesn't support typing indicators via bot API + } +} diff --git a/src/channels/local-web.ts b/src/channels/local-web.ts index 0c648f0..00ed7f1 100644 --- a/src/channels/local-web.ts +++ b/src/channels/local-web.ts @@ -503,6 +503,8 @@ export class LocalWebChannel implements Channel { private server?: http.Server; private connected = false; private opts: LocalWebChannelOpts; + /** Foreign JIDs initiated from local-web — claimed so responses route back here via sendMessage */ + private readonly proxiedJids = new Set(); constructor(opts: LocalWebChannelOpts) { this.opts = opts; @@ -540,7 +542,7 @@ export class LocalWebChannel implements Channel { } ownsJid(jid: string): boolean { - return jid.endsWith('@local.web'); + return jid.endsWith('@local.web') || this.proxiedJids.has(jid); } async disconnect(): Promise { @@ -688,6 +690,11 @@ export class LocalWebChannel implements Channel { const trimmed = text.trim(); if (!trimmed) return; + // Register foreign JIDs so agent responses route back through local-web + if (!chatJid.endsWith('@local.web')) { + this.proxiedJids.add(chatJid); + } + const now = new Date().toISOString(); this.opts.onChatMetadata(chatJid, now, 'Local Web Chat'); this.opts.onMessage(chatJid, { diff --git a/src/channels/telegram.ts b/src/channels/telegram.ts new file mode 100644 index 0000000..8e28cfa --- /dev/null +++ b/src/channels/telegram.ts @@ -0,0 +1,213 @@ +import { Bot } from 
"grammy"; + +import { ASSISTANT_NAME, TRIGGER_PATTERN } from "../config.js"; +import { logger } from "../logger.js"; +import { + Channel, + OnInboundMessage, + OnChatMetadata, + RegisteredGroup, +} from "../types.js"; + +export interface TelegramChannelOpts { + onMessage: OnInboundMessage; + onChatMetadata: OnChatMetadata; + registeredGroups: () => Record; +} + +export class TelegramChannel implements Channel { + name = "telegram"; + prefixAssistantName = false; + + private bot: Bot | null = null; + private opts: TelegramChannelOpts; + private botToken: string; + + constructor(botToken: string, opts: TelegramChannelOpts) { + this.botToken = botToken; + this.opts = opts; + } + + async connect(): Promise { + this.bot = new Bot(this.botToken); + + this.bot.command("chatid", (ctx) => { + const chatId = ctx.chat.id; + const chatType = ctx.chat.type; + const chatName = + chatType === "private" + ? ctx.from?.first_name || "Private" + : (ctx.chat as any).title || "Unknown"; + ctx.reply( + `Chat ID: \`tg:${chatId}\`\nName: ${chatName}\nType: ${chatType}`, + { parse_mode: "Markdown" }, + ); + }); + + this.bot.command("ping", (ctx) => { + ctx.reply(`${ASSISTANT_NAME} is online.`); + }); + + this.bot.on("message:text", async (ctx) => { + if (ctx.message.text.startsWith("/")) return; + + const chatJid = `tg:${ctx.chat.id}`; + let content = ctx.message.text; + const timestamp = new Date(ctx.message.date * 1000).toISOString(); + const senderName = + ctx.from?.first_name || + ctx.from?.username || + ctx.from?.id.toString() || + "Unknown"; + const sender = ctx.from?.id.toString() || ""; + const msgId = ctx.message.message_id.toString(); + + const chatName = + ctx.chat.type === "private" + ? 
senderName + : (ctx.chat as any).title || chatJid; + + const botUsername = ctx.me?.username?.toLowerCase(); + if (botUsername) { + const entities = ctx.message.entities || []; + const isBotMentioned = entities.some((entity) => { + if (entity.type === "mention") { + const mentionText = content + .substring(entity.offset, entity.offset + entity.length) + .toLowerCase(); + return mentionText === `@${botUsername}`; + } + return false; + }); + if (isBotMentioned && !TRIGGER_PATTERN.test(content)) { + content = `@${ASSISTANT_NAME} ${content}`; + } + } + + this.opts.onChatMetadata(chatJid, timestamp, chatName); + + const group = this.opts.registeredGroups()[chatJid]; + if (!group) { + logger.debug({ chatJid, chatName }, "Message from unregistered Telegram chat"); + return; + } + + this.opts.onMessage(chatJid, { + id: msgId, + chat_jid: chatJid, + sender, + sender_name: senderName, + content, + timestamp, + is_from_me: false, + }); + + logger.info({ chatJid, chatName, sender: senderName }, "Telegram message stored"); + }); + + const storeNonText = (ctx: any, placeholder: string) => { + const chatJid = `tg:${ctx.chat.id}`; + const group = this.opts.registeredGroups()[chatJid]; + if (!group) return; + + const timestamp = new Date(ctx.message.date * 1000).toISOString(); + const senderName = + ctx.from?.first_name || + ctx.from?.username || + ctx.from?.id?.toString() || + "Unknown"; + const caption = ctx.message.caption ? 
` ${ctx.message.caption}` : ""; + + this.opts.onChatMetadata(chatJid, timestamp); + this.opts.onMessage(chatJid, { + id: ctx.message.message_id.toString(), + chat_jid: chatJid, + sender: ctx.from?.id?.toString() || "", + sender_name: senderName, + content: `${placeholder}${caption}`, + timestamp, + is_from_me: false, + }); + }; + + this.bot.on("message:photo", (ctx) => storeNonText(ctx, "[Photo]")); + this.bot.on("message:video", (ctx) => storeNonText(ctx, "[Video]")); + this.bot.on("message:voice", (ctx) => storeNonText(ctx, "[Voice message]")); + this.bot.on("message:audio", (ctx) => storeNonText(ctx, "[Audio]")); + this.bot.on("message:document", (ctx) => { + const name = ctx.message.document?.file_name || "file"; + storeNonText(ctx, `[Document: ${name}]`); + }); + this.bot.on("message:sticker", (ctx) => { + const emoji = ctx.message.sticker?.emoji || ""; + storeNonText(ctx, `[Sticker ${emoji}]`); + }); + this.bot.on("message:location", (ctx) => storeNonText(ctx, "[Location]")); + this.bot.on("message:contact", (ctx) => storeNonText(ctx, "[Contact]")); + + this.bot.catch((err) => { + logger.error({ err: err.message }, "Telegram bot error"); + }); + + return new Promise((resolve) => { + this.bot!.start({ + onStart: (botInfo) => { + logger.info( + { username: botInfo.username, id: botInfo.id }, + "Telegram bot connected", + ); + console.log(`\n Telegram bot: @${botInfo.username}`); + console.log(` Send /chatid to the bot to get a chat's registration ID\n`); + resolve(); + }, + }); + }); + } + + async sendMessage(jid: string, text: string): Promise { + if (!this.bot) { + logger.warn("Telegram bot not initialized"); + return; + } + try { + const numericId = jid.replace(/^tg:/, ""); + const MAX_LENGTH = 4096; + if (text.length <= MAX_LENGTH) { + await this.bot.api.sendMessage(numericId, text); + } else { + for (let i = 0; i < text.length; i += MAX_LENGTH) { + await this.bot.api.sendMessage(numericId, text.slice(i, i + MAX_LENGTH)); + } + } + logger.info({ jid, 
length: text.length }, "Telegram message sent"); + } catch (err) { + logger.error({ jid, err }, "Failed to send Telegram message"); + } + } + + isConnected(): boolean { + return this.bot !== null; + } + + ownsJid(jid: string): boolean { + return jid.startsWith("tg:"); + } + + async disconnect(): Promise { + if (this.bot) { + this.bot.stop(); + this.bot = null; + logger.info("Telegram bot stopped"); + } + } + + async setTyping(jid: string, isTyping: boolean): Promise { + if (!this.bot || !isTyping) return; + try { + const numericId = jid.replace(/^tg:/, ""); + await this.bot.api.sendChatAction(numericId, "typing"); + } catch (err) { + logger.debug({ jid, err }, "Failed to send Telegram typing indicator"); + } + } +} diff --git a/src/channels/wecom.ts b/src/channels/wecom.ts index 13e60f7..40ecea4 100644 --- a/src/channels/wecom.ts +++ b/src/channels/wecom.ts @@ -1,373 +1,646 @@ -import crypto from 'crypto'; +import { randomUUID } from 'crypto'; +import { execFile } from 'child_process'; import fs from 'fs'; import path from 'path'; +import WebSocket from 'ws'; -import { WSClient, type WsFrame, type BaseMessage, type TextMessage, type ImageMessage, type VoiceMessage, type MixedMessage } from '@wecom/aibot-node-sdk'; - +import { ASSISTANT_NAME, TRIGGER_PATTERN, STORE_DIR } from '../config.js'; import { logger } from '../logger.js'; -import { Channel, OnInboundMessage, OnChatMetadata, RegisteredGroup } from '../types.js'; +import { + Channel, + OnInboundMessage, + OnChatMetadata, + RegisteredGroup, +} from '../types.js'; + +const WS_ENDPOINT = 'wss://openws.work.weixin.qq.com'; +const PING_INTERVAL_MS = 30_000; +const RECONNECT_DELAY_MS = 5_000; +const MSG_MAX_CHARS = 2000; // WeCom markdown message character limit + +// --- Text chunking --- +// Split text into sections by markdown headings, then pack sections into chunks +// that fit within MSG_MAX_CHARS. A section that exceeds the limit on its own +// is split further at paragraph → sentence boundaries. 
+function chunkText(text: string): string[] { + if (text.length <= MSG_MAX_CHARS) return [text]; + + // Split into logical sections at heading lines (##, ###, etc.) + const sections: string[] = []; + let current = ''; + for (const line of text.split('\n')) { + if (/^#{1,6}\s/.test(line) && current.trim()) { + sections.push(current.trimEnd()); + current = line + '\n'; + } else { + current += line + '\n'; + } + } + if (current.trim()) sections.push(current.trimEnd()); + + // Helper: split a single oversized block at paragraph / sentence boundary + const splitBlock = (block: string): string[] => { + const result: string[] = []; + let rem = block; + while (rem.length > MSG_MAX_CHARS) { + let at = MSG_MAX_CHARS; + const paraIdx = rem.lastIndexOf('\n\n', MSG_MAX_CHARS); + if (paraIdx > MSG_MAX_CHARS * 0.3) { + at = paraIdx + 2; + } else { + const lineIdx = rem.lastIndexOf('\n', MSG_MAX_CHARS); + if (lineIdx > MSG_MAX_CHARS * 0.3) { + at = lineIdx + 1; + } else { + const sentIdx = rem.slice(0, MSG_MAX_CHARS).search(/[。!?!?.]\s/); + if (sentIdx > MSG_MAX_CHARS * 0.3) at = sentIdx + 2; + } + } + result.push(rem.slice(0, at).trim()); + rem = rem.slice(at).trim(); + } + if (rem) result.push(rem); + return result; + }; + + // Pack sections greedily into chunks + const chunks: string[] = []; + let buf = ''; + for (const section of sections) { + const sep = buf ? '\n\n' : ''; + if ((buf + sep + section).length <= MSG_MAX_CHARS) { + buf += sep + section; + } else { + if (buf) { chunks.push(buf.trim()); buf = ''; } + if (section.length <= MSG_MAX_CHARS) { + buf = section; + } else { + // Section itself is too long — split it + const parts = splitBlock(section); + for (let i = 0; i < parts.length - 1; i++) chunks.push(parts[i]); + buf = parts[parts.length - 1] ?? 
''; + } + } + } + if (buf.trim()) chunks.push(buf.trim()); + return chunks; +} -const WECOM_JID_SUFFIX_GROUP = '@wecom.group'; -const WECOM_JID_SUFFIX_USER = '@wecom.user'; +// --- WhatsApp → WeCom markdown conversion --- +// WeCom uses **bold**, WhatsApp uses *bold* +function convertMarkdown(text: string): string { + // Convert *bold* → **bold** (skip already-doubled **) + return text.replace(/(? block formatting --- +// Converts ... blocks to WeCom markdown blockquote style +function formatThinkBlocks(text: string): string { + return text.replace(/([\s\S]*?)<\/think>/gi, (_, content) => { + const trimmed = content.trim(); + if (!trimmed) return ''; + const quoted = trimmed.split('\n').map((l: string) => `> ${l}`).join('\n'); + return `> 💭 **思考过程**\n${quoted}\n`; + }); +} + +// --- Quota tracker (passive replies: 30/day per JID) --- +class QuotaTracker { + private counts = new Map(); + + private today(): string { + return new Date().toLocaleDateString('en-CA', { timeZone: 'Asia/Shanghai' }); + } -interface AgentCredentials { - corpId: string; - corpSecret: string; - agentId: string; + record(jid: string): void { + const today = this.today(); + const entry = this.counts.get(jid); + if (!entry || entry.date !== today) { + this.counts.set(jid, { date: today, passive: 1 }); + return; + } + entry.passive++; + if (entry.passive >= 25) { + logger.warn({ jid, passive: entry.passive }, 'WeCom passive reply quota nearly exhausted (limit: 30/day)'); + } + } + + count(jid: string): number { + const today = this.today(); + const entry = this.counts.get(jid); + return entry?.date === today ? 
entry.passive : 0; + } +} + +// --- WeCom Corp API (for file/image sending) --- +interface AccessTokenCache { + token: string; + expiresAt: number; +} + +let _accessTokenCache: AccessTokenCache | null = null; + +async function getAccessToken(corpId: string, corpSecret: string): Promise { + if (_accessTokenCache && Date.now() < _accessTokenCache.expiresAt) { + return _accessTokenCache.token; + } + const url = `https://qyapi.weixin.qq.com/cgi-bin/gettoken?corpid=${corpId}&corpsecret=${corpSecret}`; + const res = await fetch(url); + const data = await res.json() as any; + if (data.errcode !== 0) throw new Error(`WeCom gettoken failed: ${data.errmsg}`); + _accessTokenCache = { token: data.access_token, expiresAt: Date.now() + (data.expires_in - 300) * 1000 }; + return data.access_token; +} + +async function uploadMedia(accessToken: string, filePath: string, filename: string): Promise { + return new Promise((resolve, reject) => { + execFile('curl', [ + '-s', '-X', 'POST', + `https://qyapi.weixin.qq.com/cgi-bin/media/upload?access_token=${accessToken}&type=file`, + '-F', `media=@${filePath};filename=${filename}`, + ], (err, stdout) => { + if (err) return reject(err); + try { + const data = JSON.parse(stdout) as any; + if (data.errcode !== 0 && data.errcode !== undefined) return reject(new Error(`WeCom upload failed: ${data.errmsg}`)); + resolve(data.media_id); + } catch (e) { reject(e); } + }); + }); } export interface WeComChannelOpts { - botId: string; - secret: string; - agent?: AgentCredentials; onMessage: OnInboundMessage; onChatMetadata: OnChatMetadata; registeredGroups: () => Record; - autoRegister?: (jid: string, name: string, channelName: string) => void; -} - -/** - * Tracks an inbound WeCom message frame so we can reply in-context - * using the original req_id (required by the WeCom API). 
- */ -interface PendingReply { - frame: WsFrame; - timestamp: number; + jidPrefix?: string; + corpId?: string; + corpSecret?: string; + agentId?: number; } - export class WeComChannel implements Channel { name = 'wecom'; prefixAssistantName = false; - private client!: WSClient; - private connected = false; + private botId: string; + private secret: string; private opts: WeComChannelOpts; - private pendingReplies = new Map(); - private outgoingQueue: Array<{ jid: string; text: string }> = []; - private flushing = false; - private agentToken = ''; - private agentTokenExpiresAt = 0; - - constructor(opts: WeComChannelOpts) { + private jidPrefix: string; + private corpId: string; + private corpSecret: string; + private agentId: number; + private ws: WebSocket | null = null; + private pingTimer: ReturnType | null = null; + private reconnectTimer: ReturnType | null = null; + private stopped = false; + private quota = new QuotaTracker(); + private reconnectDelay = 5_000; + private readonly maxReconnectDelay = 300_000; + + // response_url per JID (from aibot_msg_callback, used for HTTP reply) + private responseUrls: Map = new Map(); + // chattype per JID: 'single' | 'group' + private chatTypes: Map = new Map(); + // last msgid per JID + private lastMsgIds: Map = new Map(); + // last req_id from incoming message headers (required for aibot_respond_msg) + private lastReqIds: Map = new Map(); + // last sender userid per JID (used for file delivery to individual in group chats) + private lastSenders: Map = new Map(); + // path to persist req_ids across restarts + private readonly reqIdStorePath: string; + // per-JID send queue to prevent chunk interleaving across concurrent sendMessage calls + private sendQueues: Map> = new Map(); + // pending WebSocket messages awaiting ack, keyed by req_id (for 846604 retry) + private pendingWsMessages: Map = new Map(); + + constructor(botId: string, secret: string, opts: WeComChannelOpts) { + this.botId = botId; + this.secret = secret; 
this.opts = opts; - setInterval(() => this.cleanStalePendingReplies(), 5 * 60 * 1000); + this.jidPrefix = opts.jidPrefix ?? 'wc:'; + this.corpId = opts.corpId ?? ''; + this.corpSecret = opts.corpSecret ?? ''; + this.agentId = opts.agentId ?? 0; + this.reqIdStorePath = path.join(STORE_DIR, `wecom-reqids-${botId.slice(-8)}.json`); + this._loadReqIds(); + } + + private _loadReqIds(): void { + try { + if (fs.existsSync(this.reqIdStorePath)) { + const data = JSON.parse(fs.readFileSync(this.reqIdStorePath, 'utf-8')); + for (const [jid, reqId] of Object.entries(data)) { + this.lastReqIds.set(jid, reqId as string); + } + logger.debug({ count: this.lastReqIds.size }, 'WeCom req_ids loaded from disk'); + } + } catch (err) { + logger.warn({ err }, 'WeCom: failed to load persisted req_ids'); + } + } + + private _saveReqIds(): void { + try { + const data: Record = {}; + for (const [jid, reqId] of this.lastReqIds) data[jid] = reqId; + fs.writeFileSync(this.reqIdStorePath, JSON.stringify(data)); + } catch (err) { + logger.warn({ err }, 'WeCom: failed to persist req_ids'); + } } async connect(): Promise { + this.stopped = false; return new Promise((resolve, reject) => { - const timeout = setTimeout(() => reject(new Error('WeCom connection timeout')), 30_000); - - this.client = new WSClient({ - botId: this.opts.botId, - secret: this.opts.secret, - maxReconnectAttempts: -1, - logger: { - debug: (msg, ...a) => logger.info({ wecom: true, level: 'debug' }, msg, ...a), - info: (msg, ...a) => logger.info({ wecom: true }, msg, ...a), - warn: (msg, ...a) => logger.warn({ wecom: true }, msg, ...a), - error: (msg, ...a) => logger.error({ wecom: true }, msg, ...a), - }, - }); - - this.client.on('authenticated', () => { - this.connected = true; - logger.info('Connected to WeCom'); - clearTimeout(timeout); - this.flushOutgoingQueue().catch(err => - logger.error({ err }, 'Failed to flush WeCom outgoing queue'), - ); - resolve(); + const ws = new WebSocket(WS_ENDPOINT); + this.ws = ws; + let 
resolved = false; + + const done = (err?: Error) => { + if (resolved) return; + resolved = true; + if (err) reject(err); + else resolve(); + }; + + ws.on('open', () => { + logger.info('WeCom WebSocket connected, subscribing bot'); + this._send({ + cmd: 'aibot_subscribe', + headers: { req_id: randomUUID() }, + body: { bot_id: this.botId, secret: this.secret }, + }); + done(); }); - this.client.on('disconnected', (reason) => { - this.connected = false; - logger.warn({ reason }, 'WeCom disconnected'); + ws.on('message', (raw: WebSocket.RawData) => { + let msg: any; + try { + msg = JSON.parse(raw.toString()); + } catch { + logger.warn({ raw: raw.toString() }, 'WeCom: failed to parse message'); + return; + } + this._handleMessage(msg); }); - this.client.on('reconnecting', (attempt) => { - logger.info({ attempt }, 'WeCom reconnecting'); + ws.on('close', (code, reason) => { + this._stopPing(); + logger.warn({ code, reason: reason.toString() }, 'WeCom WebSocket closed'); + if (!this.stopped) { + this.reconnectTimer = setTimeout(() => this._reconnect(), this.reconnectDelay); + } }); - this.client.on('error', (err) => { - logger.error({ err }, 'WeCom error'); + ws.on('error', (err) => { + logger.error({ err }, 'WeCom WebSocket error'); + done(err); }); - - this.setupMessageHandlers(); - this.client.connect(); }); } - private setupMessageHandlers(): void { - this.client.on('message', (frame: WsFrame) => { - logger.info( - { cmd: frame.cmd, msgtype: frame.body?.msgtype, chattype: frame.body?.chattype, reqId: frame.headers?.req_id }, - 'WeCom raw message event', - ); - }); + private _reconnect(): void { + const ws = new WebSocket(WS_ENDPOINT); + this.ws = ws; - this.client.on('message.text', (frame: WsFrame) => { - this.handleInbound(frame, frame.body!.text.content); + ws.on('open', () => { + logger.info('WeCom WebSocket reconnected, re-subscribing bot'); + this._send({ + cmd: 'aibot_subscribe', + headers: { req_id: randomUUID() }, + body: { bot_id: this.botId, secret: 
this.secret }, + }); }); - this.client.on('message.voice', (frame: WsFrame) => { - this.handleInbound(frame, frame.body!.voice.content); + ws.on('message', (raw: WebSocket.RawData) => { + let msg: any; + try { + msg = JSON.parse(raw.toString()); + } catch { + return; + } + this._handleMessage(msg); }); - this.client.on('message.image', (frame: WsFrame) => { - this.handleInbound(frame, '[image]'); + ws.on('close', (code, reason) => { + this._stopPing(); + this.reconnectDelay = Math.min(this.reconnectDelay * 2, this.maxReconnectDelay); + logger.warn({ code, reason: reason.toString(), nextDelayMs: this.reconnectDelay }, 'WeCom WebSocket closed, will retry'); + if (!this.stopped) { + this.reconnectTimer = setTimeout(() => this._reconnect(), this.reconnectDelay); + } }); - this.client.on('message.mixed', (frame: WsFrame) => { - const parts = frame.body!.mixed.msg_item - .map(item => item.msgtype === 'text' ? item.text?.content : '[image]') - .filter(Boolean); - this.handleInbound(frame, parts.join(' ')); + ws.on('error', (err) => { + logger.error({ err }, 'WeCom WebSocket reconnect error'); }); } - private handleInbound(frame: WsFrame, content: string): void { - const body = frame.body!; - const chatJid = this.toChatJid(body); - const timestamp = new Date( - body.create_time ? body.create_time * 1000 : Date.now(), - ).toISOString(); - - logger.info( - { chatJid, sender: body.from.userid, chattype: body.chattype, contentPreview: content.slice(0, 80) }, - 'WeCom message received', - ); - - this.pendingReplies.set(chatJid, { frame, timestamp: Date.now() }); - - this.opts.onChatMetadata(chatJid, timestamp); - - let groups = this.opts.registeredGroups(); - if (!groups[chatJid] && this.opts.autoRegister) { - const chatName = body.chattype === 'group' - ? 
`WeCom Group ${body.chatid || chatJid}` - : `WeCom DM ${body.from.userid}`; - this.opts.autoRegister(chatJid, chatName, 'wecom'); - groups = this.opts.registeredGroups(); + private _handleMessage(msg: any): void { + // Error response (no cmd, errcode non-zero) + if (msg.errcode !== undefined && msg.errcode !== 0) { + logger.error({ errcode: msg.errcode, errmsg: msg.errmsg }, 'WeCom server error'); + console.error(`\n WeCom error ${msg.errcode}: ${msg.errmsg}\n`); + // 846604 = websocket request expired — retry via proactive Corp API + if (msg.errcode === 846604) { + const reqId: string | undefined = msg.headers?.req_id; + if (reqId) { + const pending = this.pendingWsMessages.get(reqId); + if (pending) { + this.pendingWsMessages.delete(reqId); + logger.info({ jid: pending.jid }, 'WeCom 846604: retrying via proactive send'); + this._sendProactive(pending.jid, pending.text).catch(err => { + logger.error({ err }, 'WeCom proactive retry after 846604 failed'); + }); + } + } + } + return; } - if (groups[chatJid]) { - this.opts.onMessage(chatJid, { - id: body.msgid, - chat_jid: chatJid, - sender: body.from.userid, - sender_name: body.from.userid, - content, - timestamp, - is_from_me: false, - }); - } else { - logger.info({ chatJid }, 'WeCom message from unregistered chat, freeing slot'); - this.ackAndFreeSlot(chatJid).catch(err => - logger.error({ chatJid, err }, 'Failed to free WeCom message slot'), - ); + const cmd: string = msg.cmd || ''; + + if (cmd === 'aibot_msg_callback') { + this._handleInboundMsg(msg); + return; } - } - /** - * Send a minimal finish-reply to free the WeCom concurrent-message slot - * (max 3 per user-bot pair). Without this, unreplied messages block - * all future message pushes from the server. 
- */ - private async ackAndFreeSlot(chatJid: string): Promise { - const pending = this.pendingReplies.get(chatJid); - if (!pending) return; - try { - const streamId = crypto.randomUUID(); - await this.client.replyStream(pending.frame, streamId, ' ', true); - } finally { - this.pendingReplies.delete(chatJid); + if (cmd === 'aibot_event_callback') { + logger.debug({ body: msg.body }, 'WeCom event callback (ignored)'); + return; } - } - async sendMessage(jid: string, text: string): Promise { - if (!this.connected) { - this.outgoingQueue.push({ jid, text }); - logger.info({ jid, queueSize: this.outgoingQueue.length }, 'WeCom disconnected, message queued'); + // Subscription/ping ack: {headers, errcode: 0, errmsg: "ok"} or {headers, errcode: 0} + if (!cmd && msg.headers) { + if (!this.pingTimer) { + // First ack = subscription confirmed; reset backoff + this.reconnectDelay = 5_000; + logger.info('WeCom bot subscribed successfully'); + console.log(`\n WeCom bot connected (bot_id: ${this.botId})\n`); + this._startPing(); + } return; } - try { - const pending = this.pendingReplies.get(jid); - if (pending) { - const streamId = crypto.randomUUID(); - await this.client.replyStream(pending.frame, streamId, text, true); - this.pendingReplies.delete(jid); - } else { - const chatid = this.toWeComId(jid); - await this.client.sendMessage(chatid, { - msgtype: 'markdown', - markdown: { content: text }, - }); + logger.debug({ cmd }, 'WeCom: unhandled message'); + } + + private _handleInboundMsg(msg: any): void { + const body = msg.body || {}; + const fromUser: string = body.from?.userid || body.from_user || body.sender || ''; + const chattype: string = body.chattype || 'single'; + // For group chats, body.chat_id is the group id; for single chats there may be no chat_id + const chatId: string = body.chat_id || body.chatid || fromUser; + const msgId: string = body.msgid || body.msg_id || randomUUID(); + const msgType: string = body.msgtype || 'text'; + const timestamp = new 
Date(body.create_time ? body.create_time * 1000 : Date.now()).toISOString(); + const chatJid = `${this.jidPrefix}${chatId}`; + + if (body.response_url) this.responseUrls.set(chatJid, body.response_url); + this.chatTypes.set(chatJid, chattype); + this.lastMsgIds.set(chatJid, msgId); + const reqId: string = msg.headers?.req_id || ''; + if (reqId) { + this.lastReqIds.set(chatJid, reqId); + this._saveReqIds(); + } + if (fromUser) this.lastSenders.set(chatJid, fromUser); + + this.opts.onChatMetadata(chatJid, timestamp, chattype === 'single' ? fromUser : chatId); + + let content: string; + if (msgType === 'text') { + content = body.text?.content || body.content || ''; + } else if (msgType === 'image') { + content = '[Image]'; + } else if (msgType === 'file') { + content = `[File: ${body.file?.filename || 'file'}]`; + } else if (msgType === 'voice') { + content = '[Voice message]'; + } else if (msgType === 'video') { + content = '[Video]'; + } else { + content = `[${msgType}]`; + } + + // Single chat: always prepend trigger (no @mention needed, it's a direct conversation) + if (chattype === 'single' && !TRIGGER_PATTERN.test(content)) { + content = `@${ASSISTANT_NAME} ${content}`; + } else { + // Group chat: translate @bot mentions into trigger pattern + const botIdLower = this.botId.toLowerCase(); + if (content.toLowerCase().includes(`@${botIdLower}`) && !TRIGGER_PATTERN.test(content)) { + content = `@${ASSISTANT_NAME} ${content}`; } - logger.info({ jid, length: text.length }, 'WeCom message sent'); - } catch (err) { - this.outgoingQueue.push({ jid, text }); - logger.warn({ jid, err, queueSize: this.outgoingQueue.length }, 'WeCom send failed, queued'); } + + const group = this.opts.registeredGroups()[chatJid]; + if (!group) { + logger.info({ chatJid, chattype, from: fromUser, chat_id: body.chat_id || body.chatid }, 'Message from unregistered WeCom chat — register this JID to enable replies'); + return; + } + + this.opts.onMessage(chatJid, { + id: msgId, + chat_jid: 
chatJid, + sender: fromUser, + sender_name: fromUser, + content, + timestamp, + is_from_me: false, + }); + + logger.info({ chatJid, sender: fromUser }, 'WeCom message stored'); } - async sendImage(jid: string, imagePath: string, caption?: string): Promise { - const imageBuffer = fs.readFileSync(imagePath); + async sendMessage(jid: string, text: string): Promise { + const prev = this.sendQueues.get(jid) ?? Promise.resolve(); + const next = prev.then(async () => { + const formatted = formatThinkBlocks(text); + const chunks = chunkText(formatted); + for (let i = 0; i < chunks.length; i++) { + // Add delay between chunks to ensure WeCom delivers them in order + // (WebSocket sends are fire-and-forget, rapid succession can reorder) + if (i > 0) await new Promise(r => setTimeout(r, 500)); + await this._sendChunk(jid, chunks[i]); + } + }); + this.sendQueues.set(jid, next.catch(() => {})); + await next; + } - // Agent API is the reliable way to send images (requires IP whitelist on server) - if (this.opts.agent) { + private async _sendChunk(jid: string, text: string): Promise { + const responseUrl = this.responseUrls.get(jid); + const reqId = this.lastReqIds.get(jid); + + // response_url is single-use: one shot, then discard + if (responseUrl) { + this.responseUrls.delete(jid); try { - await this.agentSendImage(jid, imageBuffer, path.basename(imagePath), caption); + const resBody = await this._httpsPost(responseUrl, { msgtype: 'markdown', markdown: { content: text } }); + logger.info({ jid, resBody }, 'WeCom response_url reply result'); + this.quota.record(jid); return; } catch (err) { - logger.warn({ jid, err }, 'Agent API image send failed, image skipped'); + logger.warn({ jid, err }, 'WeCom response_url failed, falling back to WebSocket'); + // fall through to WebSocket } } - if (caption) { - logger.info({ jid }, 'Image not sent (Agent API unavailable), sending caption as text'); - await this.sendMessage(jid, `[图片: ${path.basename(imagePath)}]\n${caption}`); - } else { 
- logger.warn({ jid, imagePath }, 'Image not sent (Agent API not configured or IP not whitelisted)'); + // WebSocket path + if (reqId) { + this._sendViaWebSocket(jid, text, reqId); + this.quota.record(jid); + return; } - } - // ── Agent API (self-built app) for media delivery ── - - private async getAgentToken(): Promise { - const agent = this.opts.agent; - if (!agent) throw new Error('Agent API not configured'); + // No passive reply context — fall back to proactive Corp API send + await this._sendProactive(jid, text); + } - if (this.agentToken && this.agentTokenExpiresAt > Date.now() + TOKEN_REFRESH_BUFFER_MS) { - return this.agentToken; + private async _sendProactive(jid: string, text: string): Promise { + if (!this.corpId || !this.corpSecret || !this.agentId) { + logger.warn({ jid }, 'WeCom proactive send: corp credentials not configured, message dropped'); + return; } - - const url = `${WECOM_API_BASE}/gettoken?corpid=${encodeURIComponent(agent.corpId)}&corpsecret=${encodeURIComponent(agent.corpSecret)}`; - const res = await fetch(url); - const json = await res.json() as { access_token?: string; expires_in?: number; errcode?: number; errmsg?: string }; - if (!json.access_token) { - throw new Error(`WeCom gettoken failed: ${json.errcode} ${json.errmsg}`); + try { + const token = await getAccessToken(this.corpId, this.corpSecret); + const rawId = jid.slice(this.jidPrefix.length); + const chatType = this.chatTypes.get(jid) || 'single'; + const toUser = chatType === 'group' + ? (this.lastSenders.get(jid) ?? 
this.opts.registeredGroups()[jid]?.notifyUser) + : rawId; + if (!toUser) { + logger.warn({ jid }, 'WeCom proactive send: no known user id for group, message dropped'); + return; + } + const formatted = convertMarkdown(text); + const resBody = await this._httpsPost( + `https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=${token}`, + { touser: toUser, msgtype: 'markdown', agentid: this.agentId, markdown: { content: formatted } }, + ); + logger.info({ jid, chatType, toUser, length: text.length, resBody }, 'WeCom proactive message sent'); + } catch (err) { + logger.error({ jid, err }, 'WeCom proactive send failed'); } - this.agentToken = json.access_token; - this.agentTokenExpiresAt = Date.now() + (json.expires_in ?? 7200) * 1000; - return this.agentToken; } - private async agentUploadMedia(buffer: Buffer, filename: string, type: 'image' | 'file'): Promise { - const token = await this.getAgentToken(); - const url = `${WECOM_API_BASE}/media/upload?access_token=${encodeURIComponent(token)}&type=${type}`; - - const boundary = `----boundary${crypto.randomBytes(16).toString('hex')}`; - const ext = filename.split('.').pop()?.toLowerCase() || ''; - const ctMap: Record = { jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png', gif: 'image/gif' }; - const contentType = ctMap[ext] || 'application/octet-stream'; - - const header = Buffer.from( - `--${boundary}\r\nContent-Disposition: form-data; name="media"; filename="${filename}"; filelength=${buffer.length}\r\nContent-Type: ${contentType}\r\n\r\n`, - ); - const footer = Buffer.from(`\r\n--${boundary}--\r\n`); - const body = Buffer.concat([header, buffer, footer]); - - const res = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': `multipart/form-data; boundary=${boundary}` }, - body, - }); - const json = await res.json() as { media_id?: string; errcode?: number; errmsg?: string }; - if (!json.media_id) { - throw new Error(`WeCom media upload failed: ${json.errcode} ${json.errmsg}`); + private 
_sendViaWebSocket(jid: string, text: string, reqId: string | undefined): void { + if (!reqId) { + logger.warn({ jid }, 'WeCom WebSocket fallback: no req_id, message dropped'); + return; } - return json.media_id; + // Store for retry on 846604; clean up stale entries (>5 min) to prevent leak + const now = Date.now(); + for (const [id, entry] of this.pendingWsMessages) { + if (now - entry.ts > 300_000) this.pendingWsMessages.delete(id); + } + this.pendingWsMessages.set(reqId, { jid, text, ts: now }); + this._send({ + cmd: 'aibot_respond_msg', + headers: { req_id: reqId }, + body: { msgtype: 'markdown', markdown: { content: text } }, + }); + logger.info({ jid, reqId, length: text.length }, 'WeCom WebSocket fallback sent'); } - private async agentSendImage(jid: string, buffer: Buffer, filename: string, caption?: string): Promise { - const agent = this.opts.agent!; - const mediaId = await this.agentUploadMedia(buffer, filename, 'image'); - const token = await this.getAgentToken(); + async sendFile(jid: string, filePath: string, filename?: string): Promise { + await this._sendMedia(jid, filePath, filename || path.basename(filePath), 'file'); + } - const isGroup = jid.endsWith(WECOM_JID_SUFFIX_GROUP); - const wecomId = this.toWeComId(jid); + async sendImage(jid: string, imagePath: string, caption?: string): Promise { + await this._sendMedia(jid, imagePath, path.basename(imagePath), 'image'); + if (caption) await this.sendMessage(jid, caption); + } - if (isGroup) { - const url = `${WECOM_API_BASE}/appchat/send?access_token=${encodeURIComponent(token)}`; - const res = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ chatid: wecomId, msgtype: 'image', image: { media_id: mediaId } }), - }); - const json = await res.json() as { errcode?: number; errmsg?: string }; - if (json.errcode !== 0) throw new Error(`appchat/send image failed: ${json.errcode} ${json.errmsg}`); - } else { - const url = 
`${WECOM_API_BASE}/message/send?access_token=${encodeURIComponent(token)}`; - const res = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - touser: wecomId, - msgtype: 'image', - agentid: Number(agent.agentId), - image: { media_id: mediaId }, - }), - }); - const json = await res.json() as { errcode?: number; errmsg?: string }; - if (json.errcode !== 0) throw new Error(`message/send image failed: ${json.errcode} ${json.errmsg}`); + private async _sendMedia(jid: string, filePath: string, filename: string, type: 'file' | 'image'): Promise { + if (!this.corpId || !this.corpSecret || !this.agentId) { + logger.warn({ jid }, 'WeCom file send: corp credentials not configured'); + return; } - - if (caption) { - await this.sendMessage(jid, caption); + if (!fs.existsSync(filePath)) { + logger.warn({ jid, filePath }, 'WeCom file send: file not found'); + return; + } + try { + const token = await getAccessToken(this.corpId, this.corpSecret); + const mediaId = await uploadMedia(token, filePath, filename); + // Extract raw ID from JID (strip prefix) + const rawId = jid.slice(this.jidPrefix.length); + const chatType = this.chatTypes.get(jid) || 'single'; + // For group chats, send to the last sender's userid (appchat/send only works for API-created groups) + const toUser = chatType === 'group' + ? (this.lastSenders.get(jid) ?? this.opts.registeredGroups()[jid]?.notifyUser ?? 
rawId) + : rawId; + const resBody = await this._httpsPost( + `https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=${token}`, + { touser: toUser, msgtype: type, agentid: this.agentId, [type]: { media_id: mediaId } }, + ); + logger.info({ jid, filename, type, resBody }, 'WeCom file sent'); + } catch (err) { + logger.error({ jid, filePath, err }, 'WeCom file send failed'); } - logger.info({ jid, filename }, 'WeCom image sent via Agent API'); } isConnected(): boolean { - return this.connected; + return this.ws !== null && this.ws.readyState === WebSocket.OPEN; } ownsJid(jid: string): boolean { - return jid.endsWith(WECOM_JID_SUFFIX_GROUP) || jid.endsWith(WECOM_JID_SUFFIX_USER); + return jid.startsWith(this.jidPrefix); } async disconnect(): Promise { - this.connected = false; - this.client?.disconnect(); - } - - private toChatJid(body: BaseMessage): string { - if (body.chattype === 'group' && body.chatid) { - return `${body.chatid}${WECOM_JID_SUFFIX_GROUP}`; + this.stopped = true; + this._stopPing(); + if (this.reconnectTimer) { + clearTimeout(this.reconnectTimer); + this.reconnectTimer = null; + } + if (this.ws) { + this.ws.close(); + this.ws = null; } - return `${body.from.userid}${WECOM_JID_SUFFIX_USER}`; + logger.info('WeCom bot stopped'); } - private toWeComId(jid: string): string { - return jid.replace(WECOM_JID_SUFFIX_GROUP, '').replace(WECOM_JID_SUFFIX_USER, ''); + async setTyping(_jid: string, _isTyping: boolean): Promise { + // WeCom Bot API does not expose a typing indicator } - private cleanStalePendingReplies(): void { - const now = Date.now(); - const maxAge = 20 * 60 * 1000; // 20 minutes - for (const [jid, pending] of this.pendingReplies) { - if (now - pending.timestamp > maxAge) { - this.pendingReplies.delete(jid); + private _startPing(): void { + this._stopPing(); + this.pingTimer = setInterval(() => { + if (this.ws?.readyState === WebSocket.OPEN) { + this._send({ cmd: 'ping', headers: { req_id: randomUUID() } }); } - } + }, 
PING_INTERVAL_MS); } - private async flushOutgoingQueue(): Promise { - if (this.flushing || this.outgoingQueue.length === 0) return; - this.flushing = true; - try { - logger.info({ count: this.outgoingQueue.length }, 'Flushing WeCom outgoing queue'); - while (this.outgoingQueue.length > 0) { - const item = this.outgoingQueue.shift()!; - await this.sendMessage(item.jid, item.text); - } - } finally { - this.flushing = false; + private _stopPing(): void { + if (this.pingTimer) { + clearInterval(this.pingTimer); + this.pingTimer = null; } } + + private _httpsPost(url: string, payload: object): Promise { + return new Promise((resolve, reject) => { + execFile('curl', [ + '-s', '--ipv4', '-X', 'POST', url, + '-H', 'Content-Type: application/json', + '-d', JSON.stringify(payload), + '--max-time', '30', + ], (err, stdout, stderr) => { + if (err) reject(err); + else resolve(stdout); + }); + }); + } + + private _send(payload: object): void { + this.ws?.send(JSON.stringify(payload)); + } } diff --git a/src/channels/whatsapp.ts b/src/channels/whatsapp.ts index 8e11eba..7ad7418 100644 --- a/src/channels/whatsapp.ts +++ b/src/channels/whatsapp.ts @@ -38,6 +38,7 @@ export class WhatsAppChannel implements Channel { private outgoingQueue: Array<{ jid: string; text: string }> = []; private flushing = false; private groupSyncTimerStarted = false; + private connectReject?: (err: Error) => void; private opts: WhatsAppChannelOpts; @@ -47,6 +48,7 @@ export class WhatsAppChannel implements Channel { async connect(): Promise { return new Promise((resolve, reject) => { + this.connectReject = reject; this.connectInternal(resolve).catch(reject); }); } diff --git a/src/cli.ts b/src/cli.ts index 953560e..8b30849 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -40,18 +40,23 @@ function ensureDirs() { }, null, 2) + '\n'); } - // Copy bio-tools skill + // Copy skills (recursive, handles nested dirs like scripts/) const skillsSrc = path.join(process.cwd(), 'container', 'skills'); const skillsDst = 
path.join(DATA_DIR, 'sessions', GROUP_FOLDER, '.claude', 'skills'); + function copyDirRecursive(src: string, dst: string) { + fs.mkdirSync(dst, { recursive: true }); + for (const entry of fs.readdirSync(src, { withFileTypes: true })) { + const s = path.join(src, entry.name); + const d = path.join(dst, entry.name); + if (entry.isDirectory()) copyDirRecursive(s, d); + else fs.copyFileSync(s, d); + } + } if (fs.existsSync(skillsSrc)) { for (const skillDir of fs.readdirSync(skillsSrc)) { const srcDir = path.join(skillsSrc, skillDir); if (!fs.statSync(srcDir).isDirectory()) continue; - const dstDir = path.join(skillsDst, skillDir); - fs.mkdirSync(dstDir, { recursive: true }); - for (const file of fs.readdirSync(srcDir)) { - fs.copyFileSync(path.join(srcDir, file), path.join(dstDir, file)); - } + copyDirRecursive(srcDir, path.join(skillsDst, skillDir)); } } } diff --git a/src/config.ts b/src/config.ts index ba57780..de79380 100644 --- a/src/config.ts +++ b/src/config.ts @@ -1,6 +1,7 @@ import { loadEnvFile } from './env.js'; import { getHomeDir } from './platform.js'; import path from 'path'; +import { fileURLToPath } from 'url'; loadEnvFile(); @@ -59,6 +60,9 @@ function escapeRegex(str: string): string { return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } +export const TELEGRAM_BOT_TOKEN = process.env.TELEGRAM_BOT_TOKEN || ""; +export const TELEGRAM_ONLY = process.env.TELEGRAM_ONLY === "true"; + export const TRIGGER_PATTERN = new RegExp( `^@${escapeRegex(ASSISTANT_NAME)}\\b`, 'i', @@ -68,3 +72,35 @@ export const TRIGGER_PATTERN = new RegExp( // Uses system timezone by default export const TIMEZONE = process.env.TZ || Intl.DateTimeFormat().resolvedOptions().timeZone; + +// MiniMax (optional) +export const MINIMAX_API_KEY = process.env.MINIMAX_API_KEY || ''; +export const MINIMAX_BASE_URL = process.env.MINIMAX_BASE_URL || 'https://api.minimax.chat/v1'; +export const MINIMAX_MODEL = process.env.MINIMAX_MODEL || 'MiniMax-M2.5'; + +// Qwen (optional — set via env or .env 
file) +export const QWEN_API_BASE = process.env.QWEN_API_BASE || ''; +export const QWEN_AUTH_TOKEN = process.env.QWEN_AUTH_TOKEN || ''; +export const QWEN_MODEL = process.env.QWEN_MODEL || ''; + +// WeCom (optional — set via env or .env file) +export const WECOM_BOT_ID = process.env.WECOM_BOT_ID || ""; +export const WECOM_SECRET = process.env.WECOM_SECRET || ""; +export const WECOM2_BOT_ID = process.env.WECOM2_BOT_ID || ""; +export const WECOM2_SECRET = process.env.WECOM2_SECRET || ""; +export const WECOM3_BOT_ID = process.env.WECOM3_BOT_ID || ""; +export const WECOM3_SECRET = process.env.WECOM3_SECRET || ""; +export const WECOM_CORP_ID = process.env.WECOM_CORP_ID || ""; +export const WECOM_CORP_SECRET = process.env.WECOM_CORP_SECRET || ""; +export const WECOM_AGENT_ID = parseInt(process.env.WECOM_AGENT_ID || "0", 10); + +// Feishu / Lark (optional — set via env or .env file) +export const FEISHU_APP_ID = process.env.FEISHU_APP_ID || ''; +export const FEISHU_APP_SECRET = process.env.FEISHU_APP_SECRET || ''; +export const FEISHU_DEFAULT_FOLDER = process.env.FEISHU_DEFAULT_FOLDER || 'main'; +export const FEISHU2_APP_ID = process.env.FEISHU2_APP_ID || ''; +export const FEISHU2_APP_SECRET = process.env.FEISHU2_APP_SECRET || ''; +export const FEISHU2_DEFAULT_FOLDER = process.env.FEISHU2_DEFAULT_FOLDER || ''; +export const FEISHU3_APP_ID = process.env.FEISHU3_APP_ID || ''; +export const FEISHU3_APP_SECRET = process.env.FEISHU3_APP_SECRET || ''; +export const FEISHU3_DEFAULT_FOLDER = process.env.FEISHU3_DEFAULT_FOLDER || ''; diff --git a/src/container-runner.ts b/src/container-runner.ts index c3ac94e..bab66e1 100644 --- a/src/container-runner.ts +++ b/src/container-runner.ts @@ -5,6 +5,7 @@ import { ChildProcess, exec, spawn } from 'child_process'; import fs from 'fs'; import path from 'path'; +import { fileURLToPath } from 'url'; import { CONTAINER_IMAGE, @@ -22,6 +23,8 @@ import { RegisteredGroup } from './types.js'; // Sentinel markers for robust output parsing (must 
match agent-runner) const OUTPUT_START_MARKER = '---BIOCLAW_OUTPUT_START---'; const OUTPUT_END_MARKER = '---BIOCLAW_OUTPUT_END---'; +const EVENT_START_MARKER = '---BIOCLAW_EVENT_START---'; +const EVENT_END_MARKER = '---BIOCLAW_EVENT_END---'; export interface ContainerInput { prompt: string; @@ -38,6 +41,26 @@ export interface ContainerOutput { result: string | null; newSessionId?: string; error?: string; + usage?: TokenUsageSummary; +} + +export interface TokenUsageSummary { + input_tokens: number; + output_tokens: number; + cache_read_tokens: number; + cache_creation_tokens: number; + cost_usd: number; + duration_ms: number; + num_turns: number; +} + +export interface ContainerEvent { + type: 'tool_call' | 'tool_result' | 'text'; + id?: string; + tool?: string; + input?: Record; + output?: string; + text?: string; } interface VolumeMount { @@ -52,7 +75,7 @@ function buildVolumeMounts( ): VolumeMount[] { const mounts: VolumeMount[] = []; const homeDir = getHomeDir(); - const projectRoot = process.cwd(); + const projectRoot = path.resolve(fileURLToPath(import.meta.url), '../..'); if (isMain) { // Main gets the entire project root mounted @@ -115,19 +138,32 @@ function buildVolumeMounts( } // Sync skills from container/skills/ into each group's .claude/skills/ + // Use manual recursive copy to avoid ENOTSUP from xattr (docker grpcfuse ownership) + function copySkillDir(src: string, dst: string) { + fs.mkdirSync(dst, { recursive: true }); + for (const entry of fs.readdirSync(src, { withFileTypes: true })) { + const s = path.join(src, entry.name); + const d = path.join(dst, entry.name); + if (entry.isDirectory()) { + copySkillDir(s, d); + } else { + try { + fs.copyFileSync(s, d, fs.constants.COPYFILE_FICLONE_FORCE); + } catch { + // fallback: read+write to skip xattr issues + fs.writeFileSync(d, fs.readFileSync(s)); + try { fs.chmodSync(d, fs.statSync(s).mode); } catch { /* ignore */ } + } + } + } + } const skillsSrc = path.join(process.cwd(), 'container', 'skills'); 
const skillsDst = path.join(groupSessionsDir, 'skills'); if (fs.existsSync(skillsSrc)) { for (const skillDir of fs.readdirSync(skillsSrc)) { const srcDir = path.join(skillsSrc, skillDir); if (!fs.statSync(srcDir).isDirectory()) continue; - const dstDir = path.join(skillsDst, skillDir); - fs.mkdirSync(dstDir, { recursive: true }); - for (const file of fs.readdirSync(srcDir)) { - const srcFile = path.join(srcDir, file); - const dstFile = path.join(dstDir, file); - fs.copyFileSync(srcFile, dstFile); - } + copySkillDir(srcDir, path.join(skillsDst, skillDir)); } } mounts.push({ @@ -176,7 +212,7 @@ function buildVolumeMounts( * Secrets are never written to disk or mounted as files. */ function readSecrets(): Record { - const envFile = path.join(process.cwd(), '.env'); + const envFile = path.join(path.resolve(fileURLToPath(import.meta.url), '../..'), '.env'); if (!fs.existsSync(envFile)) return {}; const allowedVars = [ @@ -189,6 +225,8 @@ function readSecrets(): Record { 'OPENAI_COMPATIBLE_API_KEY', 'OPENAI_COMPATIBLE_BASE_URL', 'OPENAI_COMPATIBLE_MODEL', + 'MINIMAX_API_KEY', 'MINIMAX_BASE_URL', 'MINIMAX_MODEL', + 'QWEN_API_BASE', 'QWEN_AUTH_TOKEN', 'QWEN_MODEL', ]; const secrets: Record = {}; const content = fs.readFileSync(envFile, 'utf-8'); @@ -235,6 +273,7 @@ export async function runContainerAgent( input: ContainerInput, onProcess: (proc: ChildProcess, containerName: string) => void, onOutput?: (output: ContainerOutput) => Promise, + onEvent?: (event: ContainerEvent) => void, ): Promise { const startTime = Date.now(); @@ -294,6 +333,7 @@ export async function runContainerAgent( // Streaming output: parse OUTPUT_START/END marker pairs as they arrive let parseBuffer = ''; let newSessionId: string | undefined; + let lastUsage: TokenUsageSummary | undefined; let outputChain = Promise.resolve(); container.stdout.on('data', (data) => { @@ -314,34 +354,60 @@ export async function runContainerAgent( } } - // Stream-parse for output markers - if (onOutput) { + // 
Stream-parse for output and event markers + if (onOutput || onEvent) { parseBuffer += chunk; - let startIdx: number; - while ((startIdx = parseBuffer.indexOf(OUTPUT_START_MARKER)) !== -1) { - const endIdx = parseBuffer.indexOf(OUTPUT_END_MARKER, startIdx); + let processed = true; + while (processed) { + processed = false; + const outStart = parseBuffer.indexOf(OUTPUT_START_MARKER); + const evtStart = parseBuffer.indexOf(EVENT_START_MARKER); + + // Determine which marker comes first + let nextStart: number; + let isOutput: boolean; + if (outStart !== -1 && (evtStart === -1 || outStart <= evtStart)) { + nextStart = outStart; + isOutput = true; + } else if (evtStart !== -1) { + nextStart = evtStart; + isOutput = false; + } else { + break; + } + + const endMarker = isOutput ? OUTPUT_END_MARKER : EVENT_END_MARKER; + const startLen = isOutput ? OUTPUT_START_MARKER.length : EVENT_START_MARKER.length; + const endIdx = parseBuffer.indexOf(endMarker, nextStart); if (endIdx === -1) break; // Incomplete pair, wait for more data - const jsonStr = parseBuffer - .slice(startIdx + OUTPUT_START_MARKER.length, endIdx) - .trim(); - parseBuffer = parseBuffer.slice(endIdx + OUTPUT_END_MARKER.length); + const jsonStr = parseBuffer.slice(nextStart + startLen, endIdx).trim(); + parseBuffer = parseBuffer.slice(endIdx + endMarker.length); + processed = true; try { - const parsed: ContainerOutput = JSON.parse(jsonStr); - if (parsed.newSessionId) { - newSessionId = parsed.newSessionId; + if (isOutput && onOutput) { + const parsed: ContainerOutput = JSON.parse(jsonStr); + if (parsed.newSessionId) { + newSessionId = parsed.newSessionId; + } + if (parsed.usage) { + lastUsage = parsed.usage; + } + hadStreamingOutput = true; + // Activity detected — reset the hard timeout + resetTimeout(); + // Call onOutput for all markers (including null results) + // so idle timers start even for "silent" query completions. 
+ outputChain = outputChain.then(() => onOutput(parsed)); + } else if (!isOutput && onEvent) { + const event: ContainerEvent = JSON.parse(jsonStr); + onEvent(event); } - hadStreamingOutput = true; - // Activity detected — reset the hard timeout - resetTimeout(); - // Call onOutput for all markers (including null results) - // so idle timers start even for "silent" query completions. - outputChain = outputChain.then(() => onOutput(parsed)); } catch (err) { logger.warn( { group: group.name, error: err }, - 'Failed to parse streamed output chunk', + 'Failed to parse streamed marker chunk', ); } } @@ -426,6 +492,7 @@ export async function runContainerAgent( status: 'success', result: null, newSessionId, + usage: lastUsage, }); }); return; @@ -526,13 +593,14 @@ export async function runContainerAgent( if (onOutput) { outputChain.then(() => { logger.info( - { group: group.name, duration, newSessionId }, + { group: group.name, duration, newSessionId, usage: lastUsage }, 'Container completed (streaming mode)', ); resolve({ status: 'success', result: null, newSessionId, + usage: lastUsage, }); }); return; diff --git a/src/dashboard.html b/src/dashboard.html new file mode 100644 index 0000000..7238885 --- /dev/null +++ b/src/dashboard.html @@ -0,0 +1,1645 @@ + + + + + +BioClaw Dashboard + + + +
+

🧬 BioClaw

+
+
-
Groups
+
-
Tasks
+
-
Models
+
-
Skills
+
-
Containers
+
-
Chats
+
+
+ + + + +
+
+ + +
+ + + +
+ +
+
+ +
+
+
+
+
🧬
+
BioClaw
+
How can I help you with biology research today?
+
+
+
+
+
+
+
+ +
+ + + +
+
+
+ + + +
+
+
+ +
+ + +
+
+ Registered Groups + +
+
Loading...
+
+ + +
+
+ Scheduled Tasks + +
+
Loading...
+
+ + +
+
+ Activity Statistics +
+ + + +
+
+
+
+

Messages per Day

+
+
+
+

Task Runs per Day

+
+
+
+

Avg Response Time (ms)

+
+
+
+ + +
+
+ Configured Models + +
+
Loading...
+
+ Token Usage +
+ + + +
+
+
Loading...
+
+ + +
+ +
+
Loading...
+
+ + +
+
+ Running Containers + +
+
Loading...
+
+ + +
+
+
+ Alert Rules + +
+
+
+
Add Rule
+
+ + + + +
+
+ + + + +
+
+
+ + +
+
+ + + +
+
+
+ + +
+ + + + + + diff --git a/src/dashboard.ts b/src/dashboard.ts new file mode 100644 index 0000000..7c69023 --- /dev/null +++ b/src/dashboard.ts @@ -0,0 +1,656 @@ +import { exec } from 'child_process'; +import fs from 'fs'; +import http from 'http'; +import path from 'path'; +import { fileURLToPath } from 'url'; + +import { + CONTAINER_IMAGE, + DATA_DIR, + GROUPS_DIR, + MINIMAX_API_KEY, + MINIMAX_BASE_URL, + MINIMAX_MODEL, + QWEN_API_BASE, + QWEN_AUTH_TOKEN, + QWEN_MODEL, +} from './config.js'; +import { runContainerAgent, ContainerEvent } from './container-runner.js'; +import { + deleteTask, + getActivityStats, + getAllChats, + getAllRegisteredGroups, + getAllTasks, + getGroupMessageStats, + getTaskRunLogs, + getTokenUsageByDay, + getTokenUsageSummary, + logTokenUsage, + updateTask, +} from './db.js'; +import { logger } from './logger.js'; +import { RegisteredGroup } from './types.js'; + +const PROJECT_ROOT = path.resolve(fileURLToPath(import.meta.url), '../..'); +const LOG_FILE = path.join(PROJECT_ROOT, 'logs', 'bioclaw.log'); +const HTML_FILE = path.join(PROJECT_ROOT, 'src', 'dashboard.html'); +const DASHBOARD_PORT = parseInt(process.env.DASHBOARD_PORT || '3847', 10); + +function readEnvFile(): Record { + const envFile = path.join(PROJECT_ROOT, '.env'); + if (!fs.existsSync(envFile)) return {}; + const result: Record = {}; + for (const line of fs.readFileSync(envFile, 'utf-8').split('\n')) { + const trimmed = line.trim(); + if (!trimmed || trimmed.startsWith('#')) continue; + const eqIdx = trimmed.indexOf('='); + if (eqIdx === -1) continue; + const key = trimmed.slice(0, eqIdx).trim(); + let value = trimmed.slice(eqIdx + 1).trim(); + if ((value.startsWith('"') && value.endsWith('"')) || (value.startsWith("'") && value.endsWith("'"))) { + value = value.slice(1, -1); + } + if (value) result[key] = value; + } + return result; +} + +// SSE clients listening for log lines +const sseClients = new Set(); + +// Broadcast a log line to all SSE clients +export function 
broadcastLogLine(line: string): void { + if (sseClients.size === 0) return; + const payload = `data: ${JSON.stringify(line)}\n\n`; + for (const res of sseClients) { + try { + res.write(payload); + } catch { + sseClients.delete(res); + } + } +} + +function getContainers(): Promise { + return new Promise((resolve) => { + exec( + 'docker ps --filter "name=bioclaw-" --format "{{.Names}}\\t{{.Image}}\\t{{.Status}}\\t{{.RunningFor}}"', + { timeout: 5000 }, + (_err, stdout) => { + if (!stdout) { resolve([]); return; } + resolve( + stdout.trim().split('\n').filter(Boolean).map((line) => { + const [name, image, status, running] = line.split('\t'); + return { name, image, status, running }; + }), + ); + }, + ); + }); +} + +function tailFile(filePath: string, lines: number): string[] { + if (!fs.existsSync(filePath)) return []; + try { + const stat = fs.statSync(filePath); + const chunkSize = Math.min(stat.size, lines * 200); + const fd = fs.openSync(filePath, 'r'); + const buf = Buffer.alloc(chunkSize); + fs.readSync(fd, buf, 0, chunkSize, stat.size - chunkSize); + fs.closeSync(fd); + const text = buf.toString('utf-8'); + const all = text.split('\n').filter(Boolean); + return all.slice(-lines); + } catch { + return []; + } +} + +// Model specs (context window / max output / reasoning support) +const MODEL_SPECS: Record = { + 'MiniMax-M2.5': { contextWindow: 1_000_000, maxOutput: 40_960, reasoning: true }, + 'MiniMax-M1': { contextWindow: 1_000_000, maxOutput: 40_960, reasoning: true }, + 'claude-opus-4-6': { contextWindow: 200_000, maxOutput: 32_768, reasoning: true }, + 'claude-sonnet-4-6': { contextWindow: 200_000, maxOutput: 16_384, reasoning: true }, + 'claude-haiku-4-5-20251001': { contextWindow: 200_000, maxOutput: 8_192, reasoning: false }, +}; + +function getModels(): object[] { + const groups = getAllRegisteredGroups(); + const agentCounts: Record = {}; + for (const g of Object.values(groups)) { + const t = g.agentType || 'claude'; + agentCounts[t] = (agentCounts[t] 
|| 0) + 1; + } + + const models: object[] = []; + const dotEnv = readEnvFile(); + + // Effective values: prefer process.env, fallback to .env file, then config defaults + const effectiveMiniMaxKey = process.env.MINIMAX_API_KEY || dotEnv['MINIMAX_API_KEY'] || MINIMAX_API_KEY; + const effectiveMiniMaxBase = process.env.MINIMAX_BASE_URL || dotEnv['MINIMAX_BASE_URL'] || MINIMAX_BASE_URL; + const effectiveMiniMaxModel = process.env.MINIMAX_MODEL || dotEnv['MINIMAX_MODEL'] || MINIMAX_MODEL; + const effectiveQwenBase = process.env.QWEN_API_BASE || dotEnv['QWEN_API_BASE'] || QWEN_API_BASE; + const effectiveQwenToken = process.env.QWEN_AUTH_TOKEN || dotEnv['QWEN_AUTH_TOKEN'] || QWEN_AUTH_TOKEN; + const effectiveQwenModel = process.env.QWEN_MODEL || dotEnv['QWEN_MODEL'] || QWEN_MODEL; + + if (effectiveMiniMaxModel) { + const spec = MODEL_SPECS[effectiveMiniMaxModel] ?? { contextWindow: 1_000_000, maxOutput: 40_960, reasoning: true }; + models.push({ id: 'minimax', name: effectiveMiniMaxModel, provider: 'MiniMax', + endpoint: effectiveMiniMaxBase, agentCount: agentCounts['minimax'] ?? 0, + configured: !!effectiveMiniMaxKey, ...spec }); + } + + if (effectiveQwenModel) { + const displayName = effectiveQwenModel.split('/').pop() ?? effectiveQwenModel; + models.push({ id: 'qwen', name: displayName, fullModel: effectiveQwenModel, provider: 'Qwen (Local)', + endpoint: effectiveQwenBase, agentCount: agentCounts['qwen'] ?? 0, + configured: !!(effectiveQwenBase && effectiveQwenToken), + contextWindow: 32_768, maxOutput: 8_192, reasoning: false }); + } + + const claudeModel = process.env.CLAUDE_MODEL ?? 'claude-sonnet-4-6'; + const claudeSpec = MODEL_SPECS[claudeModel] ?? { contextWindow: 200_000, maxOutput: 32_768, reasoning: true }; + const hasClaudeAuth = !!(process.env.CLAUDE_CODE_OAUTH_TOKEN || process.env.ANTHROPIC_API_KEY || dotEnv['CLAUDE_CODE_OAUTH_TOKEN'] || dotEnv['ANTHROPIC_API_KEY']); + const imageTag = CONTAINER_IMAGE.split(':')[1] ?? 
'latest';
+  models.push({ id: 'claude', name: claudeModel, provider: 'Anthropic (Claude Code)',
+    endpoint: `Docker image: ${imageTag}`, agentCount: agentCounts['claude'] ?? 0,
+    configured: hasClaudeAuth, ...claudeSpec });
+
+  return models;
+}
+
+/**
+ * Smoke-test a configured model by POSTing one prompt to `<apiBase>/chat/completions`.
+ *
+ * Settings are resolved per key from `process.env`, then the env file, then the
+ * module-level defaults. Any non-empty token usage reported by the endpoint is
+ * recorded under the `dashboard` group with source `test`.
+ *
+ * @param modelType `'minimax'` or `'qwen'`; any other value is answered with a
+ *   fixed "cannot be tested" result without making a network call.
+ * @param prompt User message sent as the single chat turn.
+ * @returns `ok` mirrors the HTTP success flag; `response` is the reply text, the
+ *   endpoint's error message, or a truncated dump of the payload. This function
+ *   never throws: failures (timeout, network, bad payload) come back as `ok: false`.
+ */
+async function testModel(modelType: string, prompt: string): Promise<{
+  ok: boolean;
+  response: string;
+  durationMs: number;
+  promptTokens?: number;
+  completionTokens?: number;
+}> {
+  const start = Date.now();
+  try {
+    const envVars = readEnvFile();
+    let apiBase: string, apiKey: string, model: string;
+    if (modelType === 'minimax') {
+      apiBase = process.env.MINIMAX_BASE_URL || envVars['MINIMAX_BASE_URL'] || MINIMAX_BASE_URL;
+      apiKey = process.env.MINIMAX_API_KEY || envVars['MINIMAX_API_KEY'] || MINIMAX_API_KEY;
+      model = process.env.MINIMAX_MODEL || envVars['MINIMAX_MODEL'] || MINIMAX_MODEL;
+    } else if (modelType === 'qwen') {
+      apiBase = process.env.QWEN_API_BASE || envVars['QWEN_API_BASE'] || QWEN_API_BASE;
+      apiKey = process.env.QWEN_AUTH_TOKEN || envVars['QWEN_AUTH_TOKEN'] || QWEN_AUTH_TOKEN;
+      model = process.env.QWEN_MODEL || envVars['QWEN_MODEL'] || QWEN_MODEL;
+    } else {
+      return { ok: false, response: 'Claude cannot be tested directly (runs in container)', durationMs: 0 };
+    }
+    if (!apiBase || !model) return { ok: false, response: 'Not configured', durationMs: 0 };
+
+    // Tolerate a base URL configured with trailing slashes so we never request `//chat/completions`.
+    const endpoint = `${apiBase.replace(/\/+$/, '')}/chat/completions`;
+    const headers: Record<string, string> = { 'Content-Type': 'application/json' };
+    // Only send credentials when a key is configured (avoids a literal "Bearer undefined").
+    if (apiKey) headers.Authorization = `Bearer ${apiKey}`;
+
+    const res = await fetch(endpoint, {
+      method: 'POST',
+      headers,
+      body: JSON.stringify({ model, messages: [{ role: 'user', content: prompt }], max_tokens: 200 }),
+      signal: AbortSignal.timeout(30_000),
+    });
+
+    // Read as text first: proxies and gateways often answer errors with HTML or plain text,
+    // and res.json() would hide the HTTP status behind a JSON syntax error.
+    const rawBody = await res.text();
+    type ChatCompletionReply = {
+      choices?: Array<{ message?: { content?: string } }>;
+      error?: { message?: string };
+      usage?: { prompt_tokens?: number; completion_tokens?: number };
+    } | null;
+    let data: ChatCompletionReply;
+    try {
+      data = JSON.parse(rawBody) as ChatCompletionReply;
+    } catch {
+      return { ok: false, response: `HTTP ${res.status}: ${rawBody.slice(0, 300)}`, durationMs: Date.now() - start };
+    }
+
+    const text = data?.choices?.[0]?.message?.content ?? data?.error?.message ?? JSON.stringify(data).slice(0, 300);
+    const usage = data?.usage ?? {};
+    const promptTokens = usage.prompt_tokens ?? 0;
+    const completionTokens = usage.completion_tokens ?? 0;
+    if (promptTokens || completionTokens) {
+      logTokenUsage({ group_folder: 'dashboard', agent_type: modelType, input_tokens: promptTokens, output_tokens: completionTokens, source: 'test', duration_ms: Date.now() - start });
+    }
+    return { ok: res.ok, response: text, durationMs: Date.now() - start,
+      promptTokens, completionTokens };
+  } catch (err: unknown) {
+    const message = err instanceof Error ? err.message : String(err);
+    return { ok: false, response: message, durationMs: Date.now() - start };
+  }
+}
+
+/**
+ * Build the capability catalogue shown by the dashboard: three hard-coded lists
+ * (agent tools, bioinformatics CLI tools, Python libraries) plus the skills found
+ * by scanning the `container/skills` directory under the current working directory.
+ */
+function getSkills(): object {
+  const agentTools = [
+    { name: 'bash', category: 'System', description: 'Execute bash commands (BLAST, samtools, python, etc.)' },
+    { name: 'read_file', category: 'File', description: 'Read file from filesystem' },
+    { name: 'write_file', category: 'File', description: 'Write content to file' },
+    { name: 'web_fetch', category: 'Network', description: 'Fetch and extract web page content' },
+    { name: 'send_message', category: 'Communication', description: 'Send progress update to user mid-task' },
+    { name: 'send_file', category: 'Communication', description: 'Send result file to user' },
+    { name: 'search_pubmed', category: 'Bio API', description: 'Search PubMed literature' },
+    { name: 'fetch_abstract', category: 'Bio API', description: 'Fetch paper abstracts by PMID' },
+    { name: 'search_chip_atlas', category: 'Bio API', description: 'Search CHIP-Atlas ChIP-seq/ATAC-seq' },
+  ];
+
+  const bioCliTools = [
+    { name: 'ncbi-blast+', category: 'Sequence', description: 'BLAST similarity search' },
+    { name: 'samtools', category: 'NGS', description: 'SAM/BAM manipulation' },
+    { name: 'bedtools', category: 'Genomics', description: 'Genome arithmetic' },
+    { name: 'bwa', category: 'Alignment', description: 'BWA short-read aligner' },
+    { name: 'minimap2', category: 'Alignment', description: 'Long-read / RNA-seq aligner' },
+    { name: 'fastqc', category: 'QC', description: 'FastQ quality control' },
+    { name: 'fastp', category: 'QC', description: 'All-in-one FASTQ preprocessor' },
+    { name: 'seqtk', category: 'Sequence', description: 'FASTA/FASTQ toolkit' },
+    { name: 'bcftools', category: 'Variant', description: 'VCF/BCF utilities' },
+    { name: 'seqkit', category: 'Sequence', description: 'FASTA/FASTQ analysis' },
+    { name: 'salmon', category: 'Quantification',description: 'Transcript quantification' },
+    { name: 'kallisto', category: 'Quantification',description: 'RNA-seq pseudo-alignment' },
+    { name: 'tabix', category: 'Indexing', description: 'Genomic file indexer' },
+    { name: 'sra-toolkit', category: 'Data', description: 'NCBI SRA data access' },
+    { name: 'pymol', category: 'Structure', description: 'Molecular visualization (headless)' },
+    { name: 'pigz', category: 'Compression', description: 'Parallel gzip compression' },
+  ];
+
+  const pythonLibs = [
+    { name: 'biopython', category: 'Core', description: 'Biological computation toolkit' },
+    { name: 'pandas', category: 'Data', description: 'Data analysis' },
+    { name: 'numpy', category: 'Data', description: 'Numerical computing' },
+    { name: 'scipy', category: 'Data', description: 'Scientific computing' },
+    { name: 'matplotlib', category: 'Visualization', description: 'Data visualization' },
+    { name: 'seaborn', category: 'Visualization', description: 'Statistical data visualization' },
+    { name: 'scikit-learn', category: 'ML', description: 'Machine learning' },
+    { name: 'scanpy', category: 'scRNA-seq', description: 'Single-cell RNA-seq analysis' },
+    { name: 'pydeseq2', category: 'RNAseq', description: 'Differential expression' },
+    { name: 'pysam', category: 'NGS', description: 'Python SAM/BAM interface' },
+    { name: 'rdkit', category: 'Cheminformatics', description: 'Chemical informatics' },
+    { name: 'anndata', category: 'scRNA-seq', description: 'Annotated data matrix' },
+    { name: 'multiqc', category: 'QC', description: 'Multi-sample QC report' },
+    { name: 'requests', category: 'Network', description: 'HTTP library for Python' },
+  ];
+
+  // Dynamically scan container/skills/
directory
+  const skillsDir = path.join(process.cwd(), 'container', 'skills');
+  const containerSkills: Array<{ name: string; category: string; description: string }> = [];
+  try {
+    const dirs = fs.readdirSync(skillsDir, { withFileTypes: true })
+      .filter(d => d.isDirectory())
+      .map(d => d.name)
+      .sort();
+    for (const dir of dirs) {
+      const skillMd = path.join(skillsDir, dir, 'SKILL.md');
+      let description = '';
+      let category = 'Bio Skill';
+      if (fs.existsSync(skillMd)) {
+        // Only the first 500 characters are searched for a `description:` line.
+        const content = fs.readFileSync(skillMd, 'utf-8').slice(0, 500);
+        const descMatch = content.match(/^description:\s*"?(.+?)"?\s*$/m);
+        if (descMatch) description = descMatch[1].replace(/^"|"$/g, '').trim();
+      }
+      if (dir === 'agent-browser') category = 'Browser';
+      else if (dir.startsWith('bio-')) category = 'Bio Pipeline';
+      else if (dir.endsWith('-database')) category = 'Database';
+      else if (dir === 'pubmed-search' || dir === 'literature-search') category = 'Literature';
+      else if (dir === 'scrna-qc' || dir === 'visium-analysis') category = 'scRNA-seq';
+      containerSkills.push({ name: dir, category, description: description || dir.replace(/-/g, ' ') });
+    }
+  } catch {
+    // skills dir not accessible
+  }
+
+  return { agentTools, bioCliTools, pythonLibs, containerSkills };
+}
+
+/** Serialize `data` as JSON and send it as the complete response with the given status. */
+function json(res: http.ServerResponse, data: unknown, status = 200): void {
+  const body = JSON.stringify(data);
+  res.writeHead(status, { 'Content-Type': 'application/json' });
+  res.end(body);
+}
+
+/**
+ * Collect the whole request body as a UTF-8 string.
+ *
+ * @param req Incoming request to drain.
+ * @param maxBytes Upper bound on the body size. The default is generous because
+ *   chat requests carry attachments inline as base64 data URLs.
+ * @returns The decoded body. Rejects if the stream errors or the body exceeds
+ *   `maxBytes` (the request is destroyed in that case).
+ */
+function readBody(req: http.IncomingMessage, maxBytes = 50 * 1024 * 1024): Promise<string> {
+  return new Promise((resolve, reject) => {
+    const chunks: Buffer[] = [];
+    let received = 0;
+    req.on('data', (d: Buffer) => {
+      received += d.length;
+      if (received > maxBytes) {
+        // Stop buffering immediately instead of letting one request exhaust memory.
+        req.destroy();
+        reject(new Error(`Request body exceeds ${maxBytes} bytes`));
+        return;
+      }
+      chunks.push(d);
+    });
+    req.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')));
+    // Without this, a stream error would leave the promise (and the request) pending forever.
+    req.on('error', reject);
+  });
+}
+
+/**
+ * Read the request body and parse it as a JSON object or array.
+ * On failure a 400 response is sent and `undefined` is returned, so the caller only has to stop.
+ */
+async function readJsonObject(
+  req: http.IncomingMessage,
+  res: http.ServerResponse,
+): Promise<Record<string, unknown> | undefined> {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(await readBody(req));
+  } catch {
+    json(res, { error: 'Invalid JSON body' }, 400);
+    return undefined;
+  }
+  if (typeof parsed !== 'object' || parsed === null) {
+    json(res, { error: 'JSON body must be an object' }, 400);
+    return undefined;
+  }
+  return parsed as Record<string, unknown>;
+}
+
+/**
+ * Parse the `days` query parameter. Absent, non-numeric or non-positive values fall back
+ * to 14, and the result is capped so the date arithmetic done with it stays in range.
+ */
+function parseDaysParam(req: http.IncomingMessage): number {
+  const url = new URL(req.url || '/', 'http://localhost');
+  const days = parseInt(url.searchParams.get('days') || '14', 10);
+  return Number.isFinite(days) && days > 0 ? Math.min(days, 3650) : 14;
+}
+
+/**
+ * Dispatch a dashboard API request.
+ *
+ * @param req Incoming request.
+ * @param res Response to write to.
+ * @param pathname Path component of the request URL.
+ * @returns `true` when a route matched and the response was sent (or, for the
+ *   streaming endpoints, started); `false` when no API route matched.
+ */
+async function handleApi(
+  req: http.IncomingMessage,
+  res: http.ServerResponse,
+  pathname: string,
+): Promise<boolean> {
+  if (pathname === '/api/groups' && req.method === 'GET') {
+    const groups = getAllRegisteredGroups();
+    json(res, groups);
+    return true;
+  }
+
+  if (pathname === '/api/tasks' && req.method === 'GET') {
+    json(res, getAllTasks());
+    return true;
+  }
+
+  if (pathname === '/api/containers' && req.method === 'GET') {
+    json(res, await getContainers());
+    return true;
+  }
+
+  if (pathname === '/api/stats' && req.method === 'GET') {
+    const chats = getAllChats();
+    const tasks = getAllTasks();
+    const groups = getAllRegisteredGroups();
+    json(res, {
+      totalChats: chats.length,
+      registeredGroups: Object.keys(groups).length,
+      activeTasks: tasks.filter((t) => t.status === 'active').length,
+      totalTasks: tasks.length,
+    });
+    return true;
+  }
+
+  if (pathname === '/api/models' && req.method === 'GET') {
+    json(res, getModels());
+    return true;
+  }
+
+  if (pathname === '/api/models/test' && req.method === 'POST') {
+    const body = await readJsonObject(req, res);
+    if (!body) return true;
+    const prompt = typeof body.prompt === 'string' && body.prompt ? body.prompt : 'Reply with exactly one word: ok';
+    json(res, await testModel(String(body.modelType ?? ''), prompt));
+    return true;
+  }
+
+  if (pathname === '/api/token-usage' && req.method === 'GET') {
+    const days = parseDaysParam(req);
+    json(res, { daily: getTokenUsageByDay(days), summary: getTokenUsageSummary(days) });
+    return true;
+  }
+
+  if (pathname === '/api/skills' && req.method === 'GET') {
+    json(res, getSkills());
+    return true;
+  }
+
+  // Task actions
+  const taskPause = pathname.match(/^\/api\/tasks\/([^/]+)\/pause$/);
+  if (taskPause && req.method === 'PUT') {
+    updateTask(taskPause[1], { status: 'paused' });
+    json(res, { ok: true });
+    return true;
+  }
+
+  const taskResume = pathname.match(/^\/api\/tasks\/([^/]+)\/resume$/);
+  if (taskResume && req.method === 'PUT') {
+    updateTask(taskResume[1], { status: 'active' });
+    json(res, { ok: true });
+    return true;
+  }
+
+  const taskDelete = pathname.match(/^\/api\/tasks\/([^/]+)$/);
+  if (taskDelete && req.method === 'DELETE') {
+    deleteTask(taskDelete[1]);
+    json(res, { ok: true });
+    return true;
+  }
+
+  const taskLogs = pathname.match(/^\/api\/task-logs\/([^/]+)$/);
+  if (taskLogs && req.method === 'GET') {
+    json(res, getTaskRunLogs(taskLogs[1], 50));
+    return true;
+  }
+
+  if (pathname === '/api/groups/stats' && req.method === 'GET') {
+    json(res, getGroupMessageStats());
+    return true;
+  }
+
+  if (pathname === '/api/activity' && req.method === 'GET') {
+    json(res, getActivityStats(parseDaysParam(req)));
+    return true;
+  }
+
+  // Alerts — stored in a simple JSON file
+  const ALERTS_FILE = path.join(PROJECT_ROOT, 'data', 'alerts.json');
+
+  if (pathname === '/api/alerts' && req.method === 'GET') {
+    try {
+      const data = fs.existsSync(ALERTS_FILE) ? JSON.parse(fs.readFileSync(ALERTS_FILE, 'utf-8')) : { rules: [] };
+      json(res, data);
+    } catch { json(res, { rules: [] }); }
+    return true;
+  }
+
+  if (pathname === '/api/alerts' && req.method === 'POST') {
+    const body = await readJsonObject(req, res);
+    if (!body) return true;
+    fs.mkdirSync(path.dirname(ALERTS_FILE), { recursive: true });
+    fs.writeFileSync(ALERTS_FILE, JSON.stringify(body, null, 2));
+    json(res, { ok: true });
+    return true;
+  }
+
+  if (pathname === '/api/alerts/status' && req.method === 'GET') {
+    const firing: Array<{ rule: string; group: string; lastMsg: string }> = [];
+    try {
+      const alertData = fs.existsSync(ALERTS_FILE) ? JSON.parse(fs.readFileSync(ALERTS_FILE, 'utf-8')) : { rules: [] };
+      const rules = alertData.rules || [];
+      if (rules.length > 0) {
+        const groupStats = getGroupMessageStats();
+        const groups = getAllRegisteredGroups();
+        for (const rule of rules) {
+          if (!rule.enabled) continue;
+          // A rule fires for every registered group whose last message is older than `hours` (default 24).
+          const thresholdMs = (rule.hours || 24) * 60 * 60 * 1000;
+          for (const stat of groupStats) {
+            if (!groups[stat.chat_jid]) continue;
+            const lastMsgTime = new Date(stat.last_msg).getTime();
+            if (Date.now() - lastMsgTime > thresholdMs) {
+              firing.push({ rule: rule.name, group: groups[stat.chat_jid].name, lastMsg: stat.last_msg });
+            }
+          }
+        }
+      }
+    } catch { /* no alerts */ }
+    json(res, { firing });
+    return true;
+  }
+
+  // Chat endpoint — streams SSE: text/tool_call/tool_result/session/done/error events
+  if (pathname === '/api/chat' && req.method === 'POST') {
+    const body = await readJsonObject(req, res);
+    if (!body) return true;
+    const { sessionId, groupFolder, attachments } = body as {
+      message: string;
+      sessionId?: string;
+      groupFolder?: string;
+      attachments?: Array<{ name: string; data: string; mime: string; fileType?: string }>;
+    };
+    // A missing message is allowed (attachment-only request); a non-string one is a client error.
+    const message = body.message ?? '';
+    if (typeof message !== 'string' || (attachments != null && !Array.isArray(attachments))) {
+      json(res, { error: 'Invalid chat request' }, 400);
+      return true;
+    }
+
+    // Resolve group and agent type
+    const groupKey = groupFolder || 'dashboard';
+    const allGroups = getAllRegisteredGroups();
+    const registeredGroup = allGroups[groupKey];
+    const folder = registeredGroup?.folder || groupKey;
+    const agentType: 'claude' | 'minimax' | 'qwen' = registeredGroup?.agentType || 'claude';
+
+    // An unregistered key is used verbatim as a directory name under GROUPS_DIR and
+    // DATA_DIR/ipc, so it must be a single path segment; otherwise "../" in the request
+    // would let the attachment writes below land outside those roots.
+    if (!registeredGroup && (typeof folder !== 'string' || folder !== path.basename(folder) || folder === '.' || folder === '..')) {
+      json(res, { error: 'Invalid groupFolder' }, 400);
+      return true;
+    }
+
+    res.writeHead(200, {
+      'Content-Type': 'text/event-stream',
+      'Cache-Control': 'no-cache',
+      Connection: 'keep-alive',
+    });
+    res.flushHeaders?.();
+
+    const sendEvent = (event: object) => {
+      try { res.write(`data: ${JSON.stringify(event)}\n\n`); } catch { /* client disconnected */ }
+    };
+
+    try {
+      // All agent types go through the container runner
+      const group: RegisteredGroup = registeredGroup || {
+        name: folder,
+        folder,
+        trigger: '',
+        added_at: new Date().toISOString(),
+      };
+      sendEvent({ type: 'status', text: 'Starting agent...' });
+
+      // Save attached images/files to group folder so container can access them
+      let prompt = message;
+      if (attachments?.length) {
+        const groupDir = path.join(GROUPS_DIR, group.folder);
+        fs.mkdirSync(groupDir, { recursive: true });
+        const savedFiles: string[] = [];
+        for (const att of attachments) {
+          if (typeof att?.data === 'string' && att.data.startsWith('data:')) {
+            const base64 = att.data.split(',')[1] || '';
+            const buf = Buffer.from(base64, 'base64');
+            const cleaned = String(att.name ?? '').replace(/[^a-zA-Z0-9._-]/g, '_');
+            // "", "." and ".." are not usable file names (the latter two resolve to directories).
+            const safeName = /^\.*$/.test(cleaned) ? `attachment_${savedFiles.length + 1}` : cleaned;
+            const filePath = path.join(groupDir, safeName);
+            fs.writeFileSync(filePath, buf);
+            savedFiles.push(safeName);
+          }
+        }
+        if (savedFiles.length) {
+          prompt += '\n\nAttached files saved to /workspace/group/: ' + savedFiles.join(', ') + '\nPlease read and analyze these files.';
+        }
+      }
+
+      let hadTextEvent = false;
+      const sentTexts = new Set<string>();
+      let gotResult = false;
+      const ipcInputDir = path.join(DATA_DIR, 'ipc', group.folder, 'input');
+      await runContainerAgent(
+        group,
+        {
+          prompt,
+          sessionId: sessionId || undefined,
+          groupFolder: group.folder,
+          chatJid: 'dashboard',
+          isMain: false,
+        },
+        () => { /* noop — no GroupQueue tracking needed for dashboard */ },
+        async (output) => {
+          // Only send output.result if no text/send_message events were streamed (avoid duplication)
+          if (output.result && output.result !== 'No response requested.' && !hadTextEvent) sendEvent({ type: 'text', text: output.result });
+          if (output.newSessionId) sendEvent({ type: 'session', sessionId: output.newSessionId });
+          if (output.status === 'error' && output.error) sendEvent({ type: 'error', message: output.error });
+          // Log token usage as soon as we receive it from the container (don't wait for exit)
+          if (output.usage && (output.usage.input_tokens > 0 || output.usage.output_tokens > 0)) {
+            logTokenUsage({
+              group_folder: group.folder,
+              // Attribute usage to the group's configured agent instead of always 'claude'.
+              agent_type: agentType,
+              input_tokens: output.usage.input_tokens,
+              output_tokens: output.usage.output_tokens,
+              cache_read_tokens: output.usage.cache_read_tokens,
+              cache_creation_tokens: output.usage.cache_creation_tokens,
+              cost_usd: output.usage.cost_usd,
+              duration_ms: output.usage.duration_ms,
+              num_turns: output.usage.num_turns,
+              source: 'dashboard',
+            });
+          }
+          // Dashboard is single-shot: after the first result, signal the container
+          // to exit so the SSE stream closes and the send button re-enables.
+          if (!gotResult && (output.result || output.usage)) {
+            gotResult = true;
+            try {
+              fs.mkdirSync(ipcInputDir, { recursive: true });
+              fs.writeFileSync(path.join(ipcInputDir, '_close'), '');
+            } catch { /* ignore */ }
+          }
+        },
+        (event: ContainerEvent) => {
+          if (event.type === 'text' || (event.type === 'tool_call' && event.tool?.endsWith('send_message'))) {
+            hadTextEvent = true;
+          }
+          // Deduplicate: skip send_message tool call if identical text was already sent as a text event, or vice versa
+          const eventText = event.type === 'text' ? event.text : (event.type === 'tool_call' && event.tool?.endsWith('send_message') ?
String((event.input as any)?.text || '') : '');
+          if (eventText) {
+            if (sentTexts.has(eventText)) return; // skip duplicate
+            sentTexts.add(eventText);
+          }
+          sendEvent(event);
+        },
+      );
+      sendEvent({ type: 'done' });
+    } catch (err: any) {
+      sendEvent({ type: 'error', message: err.message || String(err) });
+    }
+
+    res.end();
+    return true;
+  }
+
+  // SSE log stream
+  if (pathname === '/api/logs' && req.method === 'GET') {
+    res.writeHead(200, {
+      'Content-Type': 'text/event-stream',
+      'Cache-Control': 'no-cache',
+      Connection: 'keep-alive',
+    });
+    res.flushHeaders?.();
+
+    // Send last 200 lines immediately
+    const recent = tailFile(LOG_FILE, 200);
+    for (const line of recent) {
+      res.write(`data: ${JSON.stringify(line)}\n\n`);
+    }
+
+    sseClients.add(res);
+
+    // Also tail the file in case bioclaw logs don't hit broadcastLogLine
+    let filePos = fs.existsSync(LOG_FILE) ? fs.statSync(LOG_FILE).size : 0;
+    const watchInterval = setInterval(() => {
+      try {
+        if (!fs.existsSync(LOG_FILE)) return;
+        const stat = fs.statSync(LOG_FILE);
+        // The file shrank, so it was truncated or rotated: restart from the beginning
+        // instead of waiting for it to grow past the old offset.
+        if (stat.size < filePos) filePos = 0;
+        if (stat.size === filePos) return;
+        const readLen = stat.size - filePos;
+        const buf = Buffer.alloc(readLen);
+        const fd = fs.openSync(LOG_FILE, 'r');
+        let bytesRead = 0;
+        try {
+          bytesRead = fs.readSync(fd, buf, 0, readLen, filePos);
+        } finally {
+          // Always release the descriptor, even if the read throws.
+          fs.closeSync(fd);
+        }
+        // Advance by what was actually read, which can be less than the stat'ed size.
+        filePos += bytesRead;
+        const lines = buf.toString('utf-8', 0, bytesRead).split('\n').filter(Boolean);
+        for (const line of lines) {
+          res.write(`data: ${JSON.stringify(line)}\n\n`);
+        }
+      } catch {
+        // Best effort: the log file vanished between checks or the client disconnected.
+        // Deliberately not logged, since logging here would feed the file being tailed.
+        // An exception escaping this timer callback would be uncaught.
+      }
+    }, 1000);
+
+    req.on('close', () => {
+      clearInterval(watchInterval);
+      sseClients.delete(res);
+    });
+    return true;
+  }
+
+  return false;
+}
+
+/**
+ * Top-level HTTP request listener for the dashboard.
+ *
+ * Sets the CORS headers, answers `OPTIONS` preflights, delegates to `handleApi`,
+ * and otherwise serves the dashboard HTML at `/` or a JSON 404. Errors from the
+ * async handling are logged and answered with a 500 when no response has started yet.
+ */
+function handleRequest(
+  req: http.IncomingMessage,
+  res: http.ServerResponse,
+): void {
+  let pathname: string;
+  try {
+    pathname = new URL(req.url || '/', `http://localhost`).pathname;
+  } catch {
+    // A request target such as "//" is not a valid URL. The constructor throws
+    // synchronously, outside the promise chain below, so it has to be caught here.
+    res.writeHead(400, { 'Content-Type': 'application/json' });
+    res.end(JSON.stringify({ error: 'bad request' }));
+    return;
+  }
+
+  // NOTE(review): wildcard CORS lets any web page open in the user's browser call these
+  // endpoints (including the write/chat routes) — consider restricting to the dashboard origin.
+  res.setHeader('Access-Control-Allow-Origin', '*');
+  res.setHeader('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE');
+  res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
+
+  if (req.method === 'OPTIONS') {
+    res.writeHead(204);
+    res.end();
+    return;
+  }
+
+  handleApi(req, res, pathname).then((handled) => {
+    if (!handled) {
+      if (pathname === '/' && req.method === 'GET') {
+        const html = fs.readFileSync(HTML_FILE, 'utf-8');
+        res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
+        res.end(html);
+      } else {
+        res.writeHead(404, { 'Content-Type': 'application/json' });
+        res.end(JSON.stringify({ error: 'not found' }));
+      }
+    }
+  }).catch((err) => {
+    logger.error({ err }, 'Dashboard request error');
+    if (!res.headersSent) {
+      res.writeHead(500, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ error: String(err) }));
+    } else if (!res.writableEnded) {
+      // A streaming response had already started; close it so the client is not left hanging.
+      res.end();
+    }
+  });
+}
+
+/** Start the dashboard HTTP server on 127.0.0.1:DASHBOARD_PORT and log server-level errors. */
+export function startDashboard(): void {
+  const server = http.createServer(handleRequest);
+  server.listen(DASHBOARD_PORT, '127.0.0.1', () => {
+    logger.info({ port: DASHBOARD_PORT }, `Dashboard: http://127.0.0.1:${DASHBOARD_PORT}`);
+  });
+  server.on('error', (err) => {
+    logger.error({ err }, 'Dashboard server error');
+  });
+}
diff --git a/src/db.ts b/src/db.ts index 22b2fa7..32c1e11 100644 --- a/src/db.ts +++ b/src/db.ts @@ -3,7 +3,7 @@ import fs from 'fs'; import path from 'path'; import { DATA_DIR, STORE_DIR } from './config.js'; -import { NewMessage, RegisteredGroup, ScheduledTask, TaskRunLog } from './types.js'; +import { ContainerConfig, NewMessage, RegisteredGroup, ScheduledTask, TaskRunLog } from './types.js'; let db: Database.Database; @@ -82,6 +82,27 @@ function createSchema(database: Database.Database): void { } catch { /* column already exists */ } + + // Add token_usage table (migration for existing DBs) + database.exec(` + CREATE TABLE IF NOT EXISTS token_usage ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + group_folder TEXT NOT NULL, + agent_type TEXT NOT NULL DEFAULT 'claude', + input_tokens INTEGER NOT NULL DEFAULT 0, + output_tokens INTEGER NOT NULL DEFAULT 0, + cache_read_tokens INTEGER NOT NULL DEFAULT 0, + cache_creation_tokens
INTEGER NOT NULL DEFAULT 0, + cost_usd REAL NOT NULL DEFAULT 0, + duration_ms INTEGER NOT NULL DEFAULT 0, + num_turns INTEGER NOT NULL DEFAULT 0, + source TEXT NOT NULL DEFAULT 'message', + task_id TEXT, + created_at TEXT NOT NULL + ); + CREATE INDEX IF NOT EXISTS idx_token_usage_created ON token_usage(created_at); + CREATE INDEX IF NOT EXISTS idx_token_usage_group ON token_usage(group_folder); + `); } export function initDatabase(): void { @@ -411,6 +432,14 @@ export function updateTaskAfterRun( ).run(nextRun, now, lastResult, nextRun, id); } +export function getTaskRunLogs(taskId: string, limit = 20): TaskRunLog[] { + return db + .prepare( + 'SELECT * FROM task_run_logs WHERE task_id = ? ORDER BY run_at DESC LIMIT ?', + ) + .all(taskId, limit) as TaskRunLog[]; +} + export function logTaskRun(log: TaskRunLog): void { db.prepare( ` @@ -470,6 +499,21 @@ export function getAllSessions(): Record { // --- Registered group accessors --- +function parseConfigBlob(raw: string | null): { containerConfig?: ContainerConfig; agentType?: string; notifyUser?: string } { + if (!raw) return {}; + try { + const parsed = JSON.parse(raw); + // New format: { containerConfig, agentType } + if ('containerConfig' in parsed || 'agentType' in parsed) { + return parsed; + } + // Old format: containerConfig directly + return { containerConfig: parsed }; + } catch { + return {}; + } +} + export function getRegisteredGroup( jid: string, ): (RegisteredGroup & { jid: string }) | undefined { @@ -487,16 +531,17 @@ export function getRegisteredGroup( } | undefined; if (!row) return undefined; + const { containerConfig, agentType, notifyUser } = parseConfigBlob(row.container_config); return { jid: row.jid, name: row.name, folder: row.folder, trigger: row.trigger_pattern, added_at: row.added_at, - containerConfig: row.container_config - ? JSON.parse(row.container_config) - : undefined, + containerConfig, requiresTrigger: row.requires_trigger === null ? 
undefined : row.requires_trigger === 1, + agentType: agentType as 'claude' | 'minimax' | 'qwen' | undefined, + notifyUser, }; } @@ -504,6 +549,9 @@ export function setRegisteredGroup( jid: string, group: RegisteredGroup, ): void { + const configBlob = (group.containerConfig || group.agentType || group.notifyUser) + ? JSON.stringify({ containerConfig: group.containerConfig, agentType: group.agentType, notifyUser: group.notifyUser }) + : null; db.prepare( `INSERT OR REPLACE INTO registered_groups (jid, name, folder, trigger_pattern, added_at, container_config, requires_trigger) VALUES (?, ?, ?, ?, ?, ?, ?)`, @@ -513,7 +561,7 @@ export function setRegisteredGroup( group.folder, group.trigger, group.added_at, - group.containerConfig ? JSON.stringify(group.containerConfig) : null, + configBlob, group.requiresTrigger === undefined ? 1 : group.requiresTrigger ? 1 : 0, ); } @@ -532,15 +580,16 @@ export function getAllRegisteredGroups(): Record { }>; const result: Record = {}; for (const row of rows) { + const { containerConfig, agentType, notifyUser } = parseConfigBlob(row.container_config); result[row.jid] = { name: row.name, folder: row.folder, trigger: row.trigger_pattern, added_at: row.added_at, - containerConfig: row.container_config - ? JSON.parse(row.container_config) - : undefined, + containerConfig, requiresTrigger: row.requires_trigger === null ? 
undefined : row.requires_trigger === 1, + agentType: agentType as 'claude' | 'minimax' | 'qwen' | undefined, + notifyUser, }; } return result; @@ -600,3 +649,120 @@ function migrateJsonState(): void { } } } + +// --- Token usage --- + +export function logTokenUsage(entry: { + group_folder: string; + agent_type: string; + input_tokens: number; + output_tokens: number; + cache_read_tokens?: number; + cache_creation_tokens?: number; + cost_usd?: number; + duration_ms?: number; + num_turns?: number; + source?: string; + task_id?: string; +}): void { + db.prepare( + `INSERT INTO token_usage (group_folder, agent_type, input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens, cost_usd, duration_ms, num_turns, source, task_id, created_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ).run( + entry.group_folder, + entry.agent_type, + entry.input_tokens, + entry.output_tokens, + entry.cache_read_tokens ?? 0, + entry.cache_creation_tokens ?? 0, + entry.cost_usd ?? 0, + entry.duration_ms ?? 0, + entry.num_turns ?? 0, + entry.source || 'chat', + entry.task_id || null, + new Date().toISOString(), + ); +} + +export interface TokenUsageDaily { + day: string; + agent_type: string; + input_tokens: number; + output_tokens: number; + total_tokens: number; + cost_usd: number; + call_count: number; +} + +export function getTokenUsageByDay(days: number): TokenUsageDaily[] { + const since = new Date(Date.now() - days * 24 * 60 * 60 * 1000).toISOString(); + return db.prepare( + `SELECT date(created_at) as day, agent_type, + SUM(input_tokens) as input_tokens, + SUM(output_tokens) as output_tokens, + SUM(input_tokens + output_tokens) as total_tokens, + SUM(cost_usd) as cost_usd, + COUNT(*) as call_count + FROM token_usage + WHERE created_at > ? 
+ GROUP BY day, agent_type + ORDER BY day`, + ).all(since) as TokenUsageDaily[]; +} + +export interface TokenUsageSummary { + agent_type: string; + input_tokens: number; + output_tokens: number; + cache_read_tokens: number; + cache_creation_tokens: number; + total_tokens: number; + cost_usd: number; + call_count: number; +} + +export function getTokenUsageSummary(days: number): TokenUsageSummary[] { + const since = new Date(Date.now() - days * 24 * 60 * 60 * 1000).toISOString(); + return db.prepare( + `SELECT agent_type, + SUM(input_tokens) as input_tokens, + SUM(output_tokens) as output_tokens, + SUM(cache_read_tokens) as cache_read_tokens, + SUM(cache_creation_tokens) as cache_creation_tokens, + SUM(input_tokens + output_tokens) as total_tokens, + SUM(cost_usd) as cost_usd, + COUNT(*) as call_count + FROM token_usage + WHERE created_at > ? + GROUP BY agent_type + ORDER BY total_tokens DESC`, + ).all(since) as TokenUsageSummary[]; +} + +// --- Dashboard stats --- + +export interface ActivityStats { + messages: Array<{ day: string; cnt: number }>; + tasks: Array<{ day: string; total: number; success: number; avg_ms: number }>; + taskTotals: { total: number; success: number; avg_ms: number; max_ms: number }; +} + +export function getActivityStats(days: number): ActivityStats { + const since = new Date(Date.now() - days * 24 * 60 * 60 * 1000).toISOString(); + const messages = db.prepare( + `SELECT date(timestamp) as day, COUNT(*) as cnt FROM messages WHERE timestamp > ? GROUP BY day ORDER BY day`, + ).all(since) as Array<{ day: string; cnt: number }>; + const tasks = db.prepare( + `SELECT date(run_at) as day, COUNT(*) as total, SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) as success, CAST(AVG(duration_ms) AS INTEGER) as avg_ms FROM task_run_logs WHERE run_at > ? 
GROUP BY day ORDER BY day`, + ).all(since) as Array<{ day: string; total: number; success: number; avg_ms: number }>; + const taskTotals = db.prepare( + `SELECT COUNT(*) as total, SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) as success, CAST(AVG(duration_ms) AS INTEGER) as avg_ms, CAST(MAX(duration_ms) AS INTEGER) as max_ms FROM task_run_logs WHERE run_at > ?`, + ).get(since) as { total: number; success: number; avg_ms: number; max_ms: number } | undefined; + return { messages, tasks, taskTotals: taskTotals || { total: 0, success: 0, avg_ms: 0, max_ms: 0 } }; +} + +export function getGroupMessageStats(): Array<{ chat_jid: string; msg_count: number; last_msg: string }> { + return db.prepare( + `SELECT chat_jid, COUNT(*) as msg_count, MAX(timestamp) as last_msg FROM messages GROUP BY chat_jid`, + ).all() as Array<{ chat_jid: string; msg_count: number; last_msg: string }>; +} diff --git a/src/index.ts b/src/index.ts index fb702ce..96a891d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,3 @@ -import 'dotenv/config'; import { execSync } from 'child_process'; import fs from 'fs'; import path from 'path'; @@ -14,11 +13,34 @@ import { LOCAL_WEB_GROUP_NAME, MAIN_GROUP_FOLDER, POLL_INTERVAL, + STORE_DIR, + TELEGRAM_BOT_TOKEN, + TELEGRAM_ONLY, TRIGGER_PATTERN, + WECOM_BOT_ID, + WECOM_SECRET, + WECOM2_BOT_ID, + WECOM2_SECRET, + WECOM3_BOT_ID, + WECOM3_SECRET, + WECOM_CORP_ID, + WECOM_CORP_SECRET, + WECOM_AGENT_ID, + FEISHU_APP_ID, + FEISHU_APP_SECRET, + FEISHU_DEFAULT_FOLDER, + FEISHU2_APP_ID, + FEISHU2_APP_SECRET, + FEISHU2_DEFAULT_FOLDER, + FEISHU3_APP_ID, + FEISHU3_APP_SECRET, + FEISHU3_DEFAULT_FOLDER, } from './config.js'; import { LocalWebChannel } from './channels/local-web.js'; import { WhatsAppChannel } from './channels/whatsapp.js'; +import { TelegramChannel } from './channels/telegram.js'; import { WeComChannel } from './channels/wecom.js'; +import { FeishuChannel } from './channels/feishu.js'; import { DiscordChannel } from './channels/discord.js'; 
import { ContainerOutput, @@ -35,6 +57,7 @@ import { getNewMessages, getRouterState, initDatabase, + logTokenUsage, setRegisteredGroup, setRouterState, setSession, @@ -45,6 +68,7 @@ import { GroupQueue } from './group-queue.js'; import { startIpcWatcher } from './ipc.js'; import { findChannel, formatMessages, formatOutbound } from './router.js'; import { startSchedulerLoop } from './task-scheduler.js'; +import { startDashboard } from './dashboard.js'; import { Channel, NewMessage, RegisteredGroup } from './types.js'; import { logger } from './logger.js'; @@ -58,7 +82,6 @@ let lastAgentTimestamp: Record = {}; let messageLoopRunning = false; let whatsapp: WhatsAppChannel | undefined; -let wecom: WeComChannel | null = null; const channels: Channel[] = []; const queue = new GroupQueue(); @@ -136,7 +159,11 @@ export function getAvailableGroups(): import('./container-runner.js').AvailableG .filter( (c) => c.jid !== '__group_sync__' && - (c.jid.endsWith('@g.us') || c.jid.endsWith('@local.web')), + (c.jid.endsWith('@g.us') || + c.jid.endsWith('@local.web') || + c.jid.startsWith('tg:') || + c.jid.startsWith('wc') || + c.jid.startsWith('fs')), ) .map((c) => ({ jid: c.jid, @@ -217,9 +244,12 @@ async function processGroupMessages(chatJid: string): Promise { const raw = typeof result.result === 'string' ? 
result.result : JSON.stringify(result.result); const text = raw.replace(/[\s\S]*?<\/internal>/g, '').trim(); logger.info({ group: group.name }, `Agent output: ${raw.slice(0, 200)}`); - if (text) { - await sendToChannel(chatJid, text); - outputSentToUser = true; + if (text && channel) { + const formatted = formatOutbound(channel, text); + if (formatted) { + await channel.sendMessage(chatJid, formatted); + outputSentToUser = true; + } } resetIdleTimer(); } @@ -283,13 +313,28 @@ async function runAgent( new Set(Object.keys(registeredGroups)), ); - // Wrap onOutput to track session ID from streamed results + // Wrap onOutput to track session ID and log token usage from streamed results const wrappedOnOutput = onOutput ? async (output: ContainerOutput) => { if (output.newSessionId) { sessions[group.folder] = output.newSessionId; setSession(group.folder, output.newSessionId); } + // Log token usage as soon as we receive it (don't wait for container exit) + if (output.usage && (output.usage.input_tokens > 0 || output.usage.output_tokens > 0)) { + logTokenUsage({ + group_folder: group.folder, + agent_type: 'claude', + input_tokens: output.usage.input_tokens, + output_tokens: output.usage.output_tokens, + cache_read_tokens: output.usage.cache_read_tokens, + cache_creation_tokens: output.usage.cache_creation_tokens, + cost_usd: output.usage.cost_usd, + duration_ms: output.usage.duration_ms, + num_turns: output.usage.num_turns, + source: 'message', + }); + } await onOutput(output); } : undefined; @@ -486,24 +531,92 @@ async function main(): Promise { process.on('SIGTERM', () => shutdown('SIGTERM')); process.on('SIGINT', () => shutdown('SIGINT')); - // Create and connect channels - const channelCallbacks = { + const channelOpts = { onMessage: (_chatJid: string, msg: NewMessage) => storeMessage(msg), - onChatMetadata: (chatJid: string, timestamp: string) => storeChatMetadata(chatJid, timestamp), + onChatMetadata: (chatJid: string, timestamp: string, name?: string) => + 
storeChatMetadata(chatJid, timestamp, name), registeredGroups: () => registeredGroups, - autoRegister: (jid: string, name: string, channelName: string) => { - if (registeredGroups[jid]) return; - const folder = `${channelName}-${jid.split('@')[0].slice(-8)}`; - registerGroup(jid, { - name, - folder, - trigger: TRIGGER_PATTERN.source, - added_at: new Date().toISOString(), - requiresTrigger: false, - }); - }, }; + // Create and connect channels + // WhatsApp: only attempt if not TELEGRAM_ONLY and auth credentials exist + if (!TELEGRAM_ONLY) { + const authCredsPath = path.join(STORE_DIR, 'auth', 'creds.json'); + if (fs.existsSync(authCredsPath)) { + whatsapp = new WhatsAppChannel({ + onMessage: channelOpts.onMessage, + onChatMetadata: (chatJid, timestamp) => storeChatMetadata(chatJid, timestamp), + registeredGroups: () => registeredGroups, + }); + try { + channels.push(whatsapp); + await whatsapp.connect(); + } catch (err) { + channels.splice(channels.indexOf(whatsapp), 1); + logger.warn({ err }, 'WhatsApp connection failed — run /setup to re-authenticate.'); + } + } else { + logger.info('WhatsApp auth not found, skipping. 
Use /setup to configure.'); + } + } + + if (TELEGRAM_BOT_TOKEN) { + const telegram = new TelegramChannel(TELEGRAM_BOT_TOKEN, channelOpts); + channels.push(telegram); + await telegram.connect(); + } + + if (WECOM_BOT_ID && WECOM_SECRET) { + const wecom = new WeComChannel(WECOM_BOT_ID, WECOM_SECRET, { ...channelOpts, corpId: WECOM_CORP_ID, corpSecret: WECOM_CORP_SECRET, agentId: WECOM_AGENT_ID }); + channels.push(wecom); + await wecom.connect(); + } + + if (WECOM2_BOT_ID && WECOM2_SECRET) { + const wecom2 = new WeComChannel(WECOM2_BOT_ID, WECOM2_SECRET, { ...channelOpts, jidPrefix: 'wc2:', corpId: WECOM_CORP_ID, corpSecret: WECOM_CORP_SECRET, agentId: WECOM_AGENT_ID }); + channels.push(wecom2); + await wecom2.connect(); + } + + if (WECOM3_BOT_ID && WECOM3_SECRET) { + const wecom3 = new WeComChannel(WECOM3_BOT_ID, WECOM3_SECRET, { ...channelOpts, jidPrefix: 'wc3:', corpId: WECOM_CORP_ID, corpSecret: WECOM_CORP_SECRET, agentId: WECOM_AGENT_ID }); + channels.push(wecom3); + await wecom3.connect(); + } + + if (FEISHU_APP_ID && FEISHU_APP_SECRET) { + const feishu = new FeishuChannel(FEISHU_APP_ID, FEISHU_APP_SECRET, { + ...channelOpts, + defaultFolder: FEISHU_DEFAULT_FOLDER || undefined, + onRegisterGroup: FEISHU_DEFAULT_FOLDER ? registerGroup : undefined, + }); + channels.push(feishu); + await feishu.connect(); + } + + if (FEISHU2_APP_ID && FEISHU2_APP_SECRET) { + const feishu2 = new FeishuChannel(FEISHU2_APP_ID, FEISHU2_APP_SECRET, { + ...channelOpts, + jidPrefix: 'fs2:', + defaultFolder: FEISHU2_DEFAULT_FOLDER || undefined, + onRegisterGroup: FEISHU2_DEFAULT_FOLDER ? registerGroup : undefined, + }); + channels.push(feishu2); + await feishu2.connect(); + } + + if (FEISHU3_APP_ID && FEISHU3_APP_SECRET) { + const feishu3 = new FeishuChannel(FEISHU3_APP_ID, FEISHU3_APP_SECRET, { + ...channelOpts, + jidPrefix: 'fs3:', + defaultFolder: FEISHU3_DEFAULT_FOLDER || undefined, + onRegisterGroup: FEISHU3_DEFAULT_FOLDER ? 
registerGroup : undefined, + }); + channels.push(feishu3); + await feishu3.connect(); + } + + // LocalWeb channel (upstream addition — optional, only if ENABLE_LOCAL_WEB is set) if (ENABLE_LOCAL_WEB) { if (!registeredGroups[LOCAL_WEB_GROUP_JID]) { const folderConflict = Object.entries(registeredGroups).find( @@ -539,38 +652,10 @@ async function main(): Promise { await localWeb.connect(); } - // WeCom channel (connect first — WhatsApp's heavy sync can starve the event loop) - const wecomBotId = process.env.WECOM_BOT_ID; - const wecomSecret = process.env.WECOM_SECRET; - if (wecomBotId && wecomSecret) { - const agentCreds = process.env.WECOM_CORP_ID && process.env.WECOM_CORP_SECRET && process.env.WECOM_AGENT_ID - ? { corpId: process.env.WECOM_CORP_ID, corpSecret: process.env.WECOM_CORP_SECRET, agentId: process.env.WECOM_AGENT_ID } - : undefined; - wecom = new WeComChannel({ botId: wecomBotId, secret: wecomSecret, agent: agentCreds, ...channelCallbacks }); - channels.push(wecom); - try { - await wecom.connect(); - } catch (err) { - logger.error({ err }, 'WeCom connection failed, continuing without it'); - wecom = null; - } - } else { - logger.info('WeCom not configured (set WECOM_BOT_ID and WECOM_SECRET to enable)'); - } - - // WhatsApp channel (skip if DISABLE_WHATSAPP is set) - if (ENABLE_WHATSAPP && !process.env.DISABLE_WHATSAPP) { - whatsapp = new WhatsAppChannel(channelCallbacks); - channels.push(whatsapp); - await whatsapp.connect(); - } else { - logger.info('WhatsApp disabled via DISABLE_WHATSAPP env var'); - } - - // Discord channel (optional — only if token is configured) + // Discord channel (upstream addition — optional, only if token is configured) const discordToken = process.env.DISCORD_BOT_TOKEN; if (discordToken) { - const discord = new DiscordChannel({ token: discordToken, ...channelCallbacks }); + const discord = new DiscordChannel({ token: discordToken, ...channelOpts }); channels.push(discord); try { await discord.connect(); @@ -588,25 +673,29 @@ 
async function main(): Promise { queue, onProcess: (groupJid, proc, containerName, groupFolder) => queue.registerProcess(groupJid, proc, containerName, groupFolder), sendMessage: async (jid, rawText) => { - const ch = channelForJid(jid); - if (ch) { - const text = formatOutbound(ch, rawText); - if (text) await ch.sendMessage(jid, text); - } + const ch = findChannel(channels, jid); + if (!ch) return; + const text = formatOutbound(ch, rawText); + if (text) await ch.sendMessage(jid, text); }, }); startIpcWatcher({ + sendMessage: (jid, text) => { + const ch = findChannel(channels, jid); + if (!ch) throw new Error(`No channel for JID: ${jid}`); + return ch.sendMessage(jid, text); + }, + sendImage: (jid, imagePath, caption) => whatsapp?.sendImage(jid, imagePath, caption) ?? Promise.resolve(), registeredGroups: () => registeredGroups, registerGroup, - sendMessage: (jid, text) => sendToChannel(jid, text), - sendImage: (jid, imagePath, caption) => sendImageToChannel(jid, imagePath, caption), - syncGroupMetadata: (force) => whatsapp?.syncGroupMetadata(force), + syncGroupMetadata: (force) => whatsapp?.syncGroupMetadata(force) ?? 
Promise.resolve(), getAvailableGroups, writeGroupsSnapshot: (gf, im, ag, rj) => writeGroupsSnapshot(gf, im, ag, rj), }); queue.setProcessMessagesFn(processGroupMessages); recoverPendingMessages(); startMessageLoop(); + startDashboard(); } // Guard: only run when executed directly, not when imported by tests diff --git a/src/ipc.ts b/src/ipc.ts index 4e33907..c6bedba 100644 --- a/src/ipc.ts +++ b/src/ipc.ts @@ -74,50 +74,60 @@ export function startIpcWatcher(deps: IpcDeps): void { try { const data = JSON.parse(fs.readFileSync(filePath, 'utf-8')); if (data.type === 'message' && data.chatJid && data.text) { - const targetGroup = registeredGroups[data.chatJid]; - if ( - isMain || - (targetGroup && targetGroup.folder === sourceGroup) - ) { - await deps.sendMessage( - data.chatJid, - `${ASSISTANT_NAME}: ${data.text}`, - ); - logger.info( - { chatJid: data.chatJid, sourceGroup }, - 'IPC message sent', - ); + // Dashboard chat — messages are delivered via SSE, skip IPC send + if (data.chatJid === 'dashboard') { + logger.debug({ sourceGroup }, 'IPC message from dashboard chat (delivered via SSE)'); } else { - logger.warn( - { chatJid: data.chatJid, sourceGroup }, - 'Unauthorized IPC message attempt blocked', - ); - } - } else if (data.type === 'image' && data.chatJid && data.filePath) { - const targetGroup = registeredGroups[data.chatJid]; - if ( - isMain || - (targetGroup && targetGroup.folder === sourceGroup) - ) { - const hostImagePath = path.join(ipcBaseDir, sourceGroup, data.filePath); - if (fs.existsSync(hostImagePath)) { - await deps.sendImage(data.chatJid, hostImagePath, data.caption); + const targetGroup = registeredGroups[data.chatJid]; + if ( + isMain || + (targetGroup && targetGroup.folder === sourceGroup) + ) { + await deps.sendMessage( + data.chatJid, + `${ASSISTANT_NAME}: ${data.text}`, + ); logger.info( - { chatJid: data.chatJid, sourceGroup, filePath: data.filePath }, - 'IPC image sent', + { chatJid: data.chatJid, sourceGroup }, + 'IPC message sent', ); - 
try { fs.unlinkSync(hostImagePath); } catch {} } else { logger.warn( - { hostImagePath, sourceGroup }, - 'IPC image file not found', + { chatJid: data.chatJid, sourceGroup }, + 'Unauthorized IPC message attempt blocked', ); } + } + } else if (data.type === 'image' && data.chatJid && data.filePath) { + // Dashboard chat — images delivered via SSE + if (data.chatJid === 'dashboard') { + logger.debug({ sourceGroup }, 'IPC image from dashboard chat (skipped)'); } else { - logger.warn( - { chatJid: data.chatJid, sourceGroup }, - 'Unauthorized IPC image attempt blocked', - ); + const targetGroup = registeredGroups[data.chatJid]; + if ( + isMain || + (targetGroup && targetGroup.folder === sourceGroup) + ) { + const hostImagePath = path.join(ipcBaseDir, sourceGroup, data.filePath); + if (fs.existsSync(hostImagePath)) { + await deps.sendImage(data.chatJid, hostImagePath, data.caption); + logger.info( + { chatJid: data.chatJid, sourceGroup, filePath: data.filePath }, + 'IPC image sent', + ); + try { fs.unlinkSync(hostImagePath); } catch {} + } else { + logger.warn( + { hostImagePath, sourceGroup }, + 'IPC image file not found', + ); + } + } else { + logger.warn( + { chatJid: data.chatJid, sourceGroup }, + 'Unauthorized IPC image attempt blocked', + ); + } } } fs.unlinkSync(filePath); diff --git a/src/task-scheduler.ts b/src/task-scheduler.ts index 6efe7d5..82cfdad 100644 --- a/src/task-scheduler.ts +++ b/src/task-scheduler.ts @@ -16,6 +16,7 @@ import { getDueTasks, getTaskById, logTaskRun, + logTokenUsage, updateTaskAfterRun, } from './db.js'; import { GroupQueue } from './group-queue.js'; @@ -43,6 +44,8 @@ async function runTask( 'Running scheduled task', ); + // next_run was already advanced by the scheduler loop before enqueuing. 
+ const groups = deps.registeredGroups(); const group = Object.values(groups).find( (g) => g.folder === task.group_folder, @@ -136,6 +139,23 @@ async function runTask( result = output.result; } + // Log token usage from container agent + if (output.usage && (output.usage.input_tokens > 0 || output.usage.output_tokens > 0)) { + logTokenUsage({ + group_folder: task.group_folder, + agent_type: 'claude', + input_tokens: output.usage.input_tokens, + output_tokens: output.usage.output_tokens, + cache_read_tokens: output.usage.cache_read_tokens, + cache_creation_tokens: output.usage.cache_creation_tokens, + cost_usd: output.usage.cost_usd, + duration_ms: output.usage.duration_ms, + num_turns: output.usage.num_turns, + source: 'task', + task_id: task.id, + }); + } + logger.info( { taskId: task.id, durationMs: Date.now() - startTime }, 'Task completed', @@ -201,6 +221,22 @@ export function startSchedulerLoop(deps: SchedulerDependencies): void { continue; } + // Advance next_run BEFORE enqueuing to prevent duplicate triggers. + // Without this, a slow-starting task (queued behind an active container) + // would still have the old next_run when the next scheduler poll fires, + // causing getDueTasks() to return it again. 
+ let nextRun: string | null = null; + if (currentTask.schedule_type === 'cron') { + const interval = CronExpressionParser.parse(currentTask.schedule_value, { tz: TIMEZONE }); + nextRun = interval.next().toISOString(); + } else if (currentTask.schedule_type === 'interval') { + const ms = parseInt(currentTask.schedule_value, 10); + nextRun = new Date(Date.now() + ms).toISOString(); + } + if (nextRun) { + updateTaskAfterRun(currentTask.id, nextRun, '(pending)'); + } + deps.queue.enqueueTask( currentTask.chat_jid, currentTask.id, diff --git a/src/types.ts b/src/types.ts index 4a7571a..97af37a 100644 --- a/src/types.ts +++ b/src/types.ts @@ -39,6 +39,8 @@ export interface RegisteredGroup { added_at: string; containerConfig?: ContainerConfig; requiresTrigger?: boolean; // Default: true for groups, false for solo chats + agentType?: 'claude' | 'minimax' | 'qwen'; + notifyUser?: string; } export interface NewMessage { diff --git a/start-dashboard.sh b/start-dashboard.sh new file mode 100755 index 0000000..9e8fa98 --- /dev/null +++ b/start-dashboard.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# If bioclaw is already running on 3847 (via launchd), just wait. +# Otherwise start the dev server. +if lsof -i :3847 -sTCP:LISTEN -t > /dev/null 2>&1; then + echo "BioClaw dashboard already running on port 3847" + # Keep process alive so preview tool considers it running + while true; do sleep 60; done +else + cd "$(dirname "$0")" + exec npm run dev +fi