From 91439209f1ec18f703f195f18900c503496f57e0 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 30 Jan 2026 20:40:32 +0000 Subject: [PATCH 01/62] Add agent swarm for parallel behavior investigation Implements a SLURM-based system for launching parallel Claude Code agents that investigate behaviors in SPD model decompositions. Key components: - spd-swarm CLI: Submits SLURM array job for N agents - Each agent starts isolated app backend (unique port, separate database) - Detailed system prompt guides agents through investigation methodology - Findings written to append-only JSONL files (events.jsonl, explanations.jsonl) New files: - spd/agent_swarm/schemas.py: BehaviorExplanation, SwarmEvent schemas - spd/agent_swarm/agent_prompt.py: Detailed API and methodology instructions - spd/agent_swarm/scripts/run_slurm_cli.py: CLI entry point - spd/agent_swarm/scripts/run_slurm.py: SLURM submission logic - spd/agent_swarm/scripts/run_agent.py: Worker script for each job Also adds SPD_APP_DB_PATH env var support for database isolation. 
https://claude.ai/code/session_01UMpYFZ3A98vsPkqoq6zvT6 --- CLAUDE.md | 24 ++ pyproject.toml | 1 + spd/agent_swarm/CLAUDE.md | 124 +++++++++ spd/agent_swarm/__init__.py | 22 ++ spd/agent_swarm/agent_prompt.py | 330 +++++++++++++++++++++++ spd/agent_swarm/schemas.py | 120 +++++++++ spd/agent_swarm/scripts/__init__.py | 1 + spd/agent_swarm/scripts/run_agent.py | 284 +++++++++++++++++++ spd/agent_swarm/scripts/run_slurm.py | 119 ++++++++ spd/agent_swarm/scripts/run_slurm_cli.py | 62 +++++ spd/app/backend/database.py | 18 +- 11 files changed, 1103 insertions(+), 2 deletions(-) create mode 100644 spd/agent_swarm/CLAUDE.md create mode 100644 spd/agent_swarm/__init__.py create mode 100644 spd/agent_swarm/agent_prompt.py create mode 100644 spd/agent_swarm/schemas.py create mode 100644 spd/agent_swarm/scripts/__init__.py create mode 100644 spd/agent_swarm/scripts/run_agent.py create mode 100644 spd/agent_swarm/scripts/run_slurm.py create mode 100644 spd/agent_swarm/scripts/run_slurm_cli.py diff --git a/CLAUDE.md b/CLAUDE.md index 30da03eb6..2e636885c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,6 +156,7 @@ Each experiment (`spd/experiments/{tms,resid_mlp,lm}/`) contains: ├── scripts/ # Standalone utility scripts ├── tests/ # Test suite ├── spd/ # Main source code +│ ├── agent_swarm/ # Parallel agent investigation (see agent_swarm/CLAUDE.md) │ ├── app/ # Web visualization app (see app/CLAUDE.md) │ ├── autointerp/ # LLM interpretation (see autointerp/CLAUDE.md) │ ├── clustering/ # Component clustering (see clustering/CLAUDE.md) @@ -195,6 +196,7 @@ Each experiment (`spd/experiments/{tms,resid_mlp,lm}/`) contains: | `spd-autointerp` | `spd/autointerp/scripts/cli.py` | Submit autointerp SLURM job | | `spd-attributions` | `spd/dataset_attributions/scripts/run_slurm_cli.py` | Submit dataset attribution SLURM job | | `spd-clustering` | `spd/clustering/scripts/run_pipeline.py` | Clustering pipeline | +| `spd-swarm` | `spd/agent_swarm/scripts/run_slurm_cli.py` | Launch parallel 
agent swarm | ### Files to Skip When Searching @@ -231,6 +233,9 @@ Use `spd/` as the search root (not repo root) to avoid noise. **Clustering Pipeline:** - `spd-clustering` → `spd/clustering/scripts/run_pipeline.py` → `spd/utils/slurm.py` → `spd/clustering/scripts/run_clustering.py` +**Agent Swarm Pipeline:** +- `spd-swarm` → `spd/agent_swarm/scripts/run_slurm_cli.py` → `spd/utils/slurm.py` → SLURM array → `spd/agent_swarm/scripts/run_agent.py` → Claude Code + ## Common Usage Patterns ### Running Experiments Locally (`spd-local`) @@ -277,6 +282,25 @@ spd-autointerp # Submit SLURM job to interpret component Requires `OPENROUTER_API_KEY` env var. See `spd/autointerp/CLAUDE.md` for details. +### Agent Swarm for Parallel Investigation (`spd-swarm`) + +Launch a swarm of Claude Code agents to investigate behaviors in an SPD model: + +```bash +spd-swarm --n_agents 10 # Launch 10 parallel agents +spd-swarm --n_agents 5 --time 4:00:00 # Custom time limit +``` + +Each agent: +- Runs in its own SLURM job with 1 GPU +- Starts an isolated app backend instance +- Investigates behaviors using the SPD app API +- Writes findings to append-only JSONL files + +Output: `SPD_OUT_DIR/agent_swarm//task_*/explanations.jsonl` + +See `spd/agent_swarm/CLAUDE.md` for details. 
+ ### Running on SLURM Cluster (`spd-run`) For the core team, `spd-run` provides full-featured SLURM orchestration: diff --git a/pyproject.toml b/pyproject.toml index 76a539454..24f47fe64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ spd-clustering = "spd.clustering.scripts.run_pipeline:cli" spd-harvest = "spd.harvest.scripts.run_slurm_cli:cli" spd-autointerp = "spd.autointerp.scripts.run_slurm_cli:cli" spd-attributions = "spd.dataset_attributions.scripts.run_slurm_cli:cli" +spd-swarm = "spd.agent_swarm.scripts.run_slurm_cli:cli" [build-system] requires = ["setuptools", "wheel"] diff --git a/spd/agent_swarm/CLAUDE.md b/spd/agent_swarm/CLAUDE.md new file mode 100644 index 000000000..ee2e89be2 --- /dev/null +++ b/spd/agent_swarm/CLAUDE.md @@ -0,0 +1,124 @@ +# Agent Swarm Module + +This module provides infrastructure for launching parallel SLURM-based Claude Code agents +that investigate behaviors in SPD model decompositions. + +## Overview + +The agent swarm system allows you to: +1. Launch many parallel agents (each as a SLURM job with 1 GPU) +2. Each agent runs an isolated app backend instance +3. Agents investigate behaviors using the SPD app API +4. 
Findings are written to append-only JSONL files + +## Usage + +```bash +# Launch 10 agents to investigate a decomposition +spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 10 + +# With custom settings +spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 5 --context_length 64 --time 4:00:00 +``` + +## Architecture + +``` +spd/agent_swarm/ +├── __init__.py # Public exports +├── CLAUDE.md # This file +├── schemas.py # Pydantic models for outputs +├── agent_prompt.py # System prompt for agents +└── scripts/ + ├── __init__.py + ├── run_slurm_cli.py # CLI entry point (spd-swarm) + ├── run_slurm.py # SLURM submission logic + └── run_agent.py # Worker script (runs in each SLURM job) +``` + +## Output Structure + +``` +SPD_OUT_DIR/agent_swarm// +├── metadata.json # Swarm configuration +├── task_1/ +│ ├── events.jsonl # Progress and observations +│ ├── explanations.jsonl # Complete behavior explanations +│ ├── app.db # Isolated SQLite database +│ ├── agent_prompt.md # The prompt given to the agent +│ └── claude_output.txt # Raw Claude Code output +├── task_2/ +│ └── ... +└── task_N/ + └── ... +``` + +## Key Files + +| File | Purpose | +|------|---------| +| `schemas.py` | Defines `BehaviorExplanation`, `SwarmEvent`, `Evidence` schemas | +| `agent_prompt.py` | Contains detailed instructions for agents on using the API | +| `run_slurm.py` | Creates git snapshot, generates commands, submits SLURM array | +| `run_agent.py` | Starts backend, loads run, launches Claude Code | + +## Schemas + +### BehaviorExplanation +The primary output - documents a discovered behavior: +- `subject_prompt`: Prompt demonstrating the behavior +- `behavior_description`: What the model does +- `components_involved`: List of components and their roles +- `explanation`: How components work together +- `supporting_evidence`: Ablations, attributions, etc. 
+- `confidence`: high/medium/low +- `alternative_hypotheses`: Other considered explanations +- `limitations`: Known caveats + +### SwarmEvent +General logging: +- `event_type`: start, progress, observation, hypothesis, test_result, error, complete +- `timestamp`: When it occurred +- `message`: Human-readable description +- `details`: Structured data + +## Database Isolation + +Each agent gets its own SQLite database via the `SPD_APP_DB_PATH` environment variable. +This prevents conflicts when multiple agents run on the same machine. + +## Monitoring + +```bash +# Watch events from all agents +tail -f SPD_OUT_DIR/agent_swarm//task_*/events.jsonl + +# View all explanations +cat SPD_OUT_DIR/agent_swarm//task_*/explanations.jsonl | jq . + +# Check SLURM job status +squeue --me + +# View specific job logs +tail -f ~/slurm_logs/slurm-_.out +``` + +## Configuration + +CLI arguments: +- `wandb_path`: Required - WandB run path for the SPD decomposition +- `--n_agents`: Required - Number of parallel agents to launch +- `--context_length`: Token context length (default: 128) +- `--partition`: SLURM partition (default: h200-reserved) +- `--time`: Time limit per agent (default: 8:00:00) +- `--job_suffix`: Optional suffix for job names + +## Extending + +To modify agent behavior: +1. Edit `agent_prompt.py` to change investigation instructions +2. Update `schemas.py` to add new output fields +3. Modify `run_agent.py` to change the worker flow + +The agent prompt is the primary way to guide agent behavior - it contains +detailed API documentation and scientific methodology guidance. diff --git a/spd/agent_swarm/__init__.py b/spd/agent_swarm/__init__.py new file mode 100644 index 000000000..cac91de2d --- /dev/null +++ b/spd/agent_swarm/__init__.py @@ -0,0 +1,22 @@ +"""Agent Swarm: Parallel SLURM-based agent investigation of model behaviors. 
+ +This module provides infrastructure for launching many parallel Claude Code agents, +each investigating behaviors in an SPD model decomposition. Each agent: +1. Starts an isolated app backend instance (separate database, unique port) +2. Receives detailed instructions on using the SPD app API +3. Investigates behaviors and writes findings to append-only JSONL files +""" + +from spd.agent_swarm.schemas import ( + BehaviorExplanation, + ComponentInfo, + Evidence, + SwarmEvent, +) + +__all__ = [ + "BehaviorExplanation", + "ComponentInfo", + "Evidence", + "SwarmEvent", +] diff --git a/spd/agent_swarm/agent_prompt.py b/spd/agent_swarm/agent_prompt.py new file mode 100644 index 000000000..e3cf4fd21 --- /dev/null +++ b/spd/agent_swarm/agent_prompt.py @@ -0,0 +1,330 @@ +"""System prompt for SPD investigation agents. + +This module contains the detailed instructions given to each agent in the swarm. +The prompt explains how to use the SPD app API and the scientific methodology +for investigating model behaviors. +""" + +AGENT_SYSTEM_PROMPT = """ +# SPD Behavior Investigation Agent + +You are a research agent investigating behaviors in a neural network model decomposition. +Your goal is to find interesting behaviors, understand how components interact to produce +them, and document your findings as explanations. + +## Your Mission + +You are part of a swarm of agents, each independently investigating behaviors in the same +model. Your task is to: + +1. **Find a behavior**: Discover a prompt where the model does something interesting + (e.g., predicts the correct gendered pronoun, completes a pattern, etc.) + +2. **Understand the mechanism**: Figure out which components are involved and how they + work together to produce the behavior + +3. 
**Document your findings**: Write a clear explanation with supporting evidence + +## The SPD App Backend + +You have access to an SPD (Stochastic Parameter Decomposition) app backend running at: +`http://localhost:{port}` + +This app provides APIs for: +- Loading decomposed models +- Computing attribution graphs showing how components interact +- Optimizing sparse circuits for specific behaviors +- Running interventions (ablations) to test hypotheses +- Viewing component interpretations and correlations +- Searching the training dataset + +## API Reference + +### Health Check +```bash +curl http://localhost:{port}/api/health +# Returns: {{"status": "ok"}} +``` + +### Load a Run (ALREADY DONE FOR YOU) +The run is pre-loaded. Check status with: +```bash +curl http://localhost:{port}/api/status +``` + +### Create a Custom Prompt +To analyze a specific prompt: +```bash +curl -X POST "http://localhost:{port}/api/prompts/custom?text=The%20boy%20ate%20his" +# Returns: {{"id": , "token_ids": [...], "tokens": [...], "preview": "...", "next_token_probs": [...]}} +``` + +### Compute Optimized Attribution Graph (MOST IMPORTANT) +This optimizes a sparse circuit that achieves a behavior: +```bash +curl -X POST "http://localhost:{port}/api/graphs/optimized/stream?prompt_id=&loss_type=ce&loss_position=&label_token=&steps=100&imp_min_coeff=0.1&pnorm=0.5&mask_type=hard&loss_coeff=1.0&ci_threshold=0.01&normalize=target" +# Streams SSE events, final event has type="complete" with graph data +``` + +Parameters: +- `prompt_id`: ID from creating custom prompt +- `loss_type`: "ce" for cross-entropy (predicting specific token) or "kl" (matching full distribution) +- `loss_position`: Token position to optimize (0-indexed, usually last position) +- `label_token`: Token ID to predict (for CE loss) +- `steps`: Optimization steps (50-200 typical) +- `imp_min_coeff`: Importance minimization coefficient (0.05-0.3) +- `pnorm`: P-norm for sparsity (0.3-1.0, lower = sparser) +- `mask_type`: "hard" 
for binary masks, "soft" for continuous +- `ci_threshold`: Threshold for including nodes in graph (0.01-0.1) +- `normalize`: "target" normalizes by target layer, "none" for raw values + +### Get Component Interpretations +```bash +curl "http://localhost:{port}/api/correlations/interpretations" +# Returns: {{"h.0.mlp.c_fc:5": {{"label": "...", "confidence": "high"}}, ...}} +``` + +Get full interpretation details: +```bash +curl "http://localhost:{port}/api/correlations/interpretations/h.0.mlp.c_fc/5" +# Returns: {{"reasoning": "...", "prompt": "..."}} +``` + +### Get Component Token Statistics +```bash +curl "http://localhost:{port}/api/correlations/token_stats/h.0.mlp.c_fc/5?top_k=20" +# Returns input/output token associations +``` + +### Get Component Correlations +```bash +curl "http://localhost:{port}/api/correlations/components/h.0.mlp.c_fc/5?top_k=20" +# Returns components that frequently co-activate +``` + +### Run Intervention (Ablation) +Test a hypothesis by running the model with only selected components active: +```bash +curl -X POST "http://localhost:{port}/api/intervention/run" \\ + -H "Content-Type: application/json" \\ + -d '{{"graph_id": , "text": "The boy ate his", "selected_nodes": ["h.0.mlp.c_fc:3:5", "h.1.attn.o_proj:3:10"], "top_k": 10}}' +# Returns predictions with only selected components active vs full model +``` + +Node format: "layer:seq_pos:component_idx" +- `layer`: e.g., "h.0.mlp.c_fc", "h.1.attn.o_proj" +- `seq_pos`: Position in sequence (0-indexed) +- `component_idx`: Component index within layer + +### Search Dataset +Find prompts with specific patterns: +```bash +curl -X POST "http://localhost:{port}/api/dataset/search?query=she%20said&split=train" +curl "http://localhost:{port}/api/dataset/results?page=1&page_size=20" +``` + +### Get Random Samples with Loss +Find high/low loss examples: +```bash +curl "http://localhost:{port}/api/dataset/random_with_loss?n_samples=20&seed=42" +``` + +### Probe Component Activation +See how a 
component responds to arbitrary text: +```bash +curl -X POST "http://localhost:{port}/api/activation_contexts/probe" \\ + -H "Content-Type: application/json" \\ + -d '{{"text": "The boy ate his", "layer": "h.0.mlp.c_fc", "component_idx": 5}}' +# Returns CI values and activations at each position +``` + +### Get Dataset Attributions +See which components influence each other across the training data: +```bash +curl "http://localhost:{port}/api/dataset_attributions/h.0.mlp.c_fc/5?k=10" +# Returns positive/negative sources and targets +``` + +## Investigation Methodology + +### Step 1: Find an Interesting Behavior + +Start by exploring the model's behavior: + +1. **Search for patterns**: Use `/api/dataset/search` to find prompts with specific + linguistic patterns (pronouns, verb conjugations, completions, etc.) + +2. **Look at high-loss examples**: Use `/api/dataset/random_with_loss` to find where + the model struggles or succeeds + +3. **Create test prompts**: Use `/api/prompts/custom` to create prompts that test + specific capabilities + +Good behaviors to investigate: +- Gendered pronoun prediction ("The doctor said she" vs "The doctor said he") +- Subject-verb agreement ("The cats are" vs "The cat is") +- Pattern completion ("1, 2, 3," → "4") +- Semantic associations ("The capital of France is" → "Paris") +- Grammatical structure (completing sentences correctly) + +### Step 2: Optimize a Sparse Circuit + +Once you have a behavior: + +1. **Create the prompt** via `/api/prompts/custom` + +2. **Identify the target token**: What token should be predicted? Get its ID from + the tokenizer or from the prompt creation response. + +3. **Run optimization** via `/api/graphs/optimized/stream`: + - Use `loss_type=ce` with the target token + - Set `loss_position` to the position where prediction matters + - Start with `imp_min_coeff=0.1`, `pnorm=0.5`, `steps=100` + - Use `ci_threshold=0.01` to see active components + +4. 
**Examine the graph**: The response shows: + - `nodeCiVals`: Which components are active (high CI = important) + - `edges`: How components connect (gradient flow) + - `outputProbs`: Model predictions + +### Step 3: Understand Component Roles + +For each important component in the graph: + +1. **Check the interpretation**: Use `/api/correlations/interpretations//` + to see if we already have an idea what this component does + +2. **Look at token stats**: Use `/api/correlations/token_stats//` to see + what tokens activate this component (input) and what it predicts (output) + +3. **Check correlations**: Use `/api/correlations/components//` to see + what other components co-activate + +4. **Probe on variations**: Use `/api/activation_contexts/probe` to see how the + component responds to related prompts + +### Step 4: Test with Ablations + +Form hypotheses and test them: + +1. **Hypothesis**: "Component X stores information about gender" + +2. **Test**: Run intervention with and without component X + - If prediction changes as expected → supports hypothesis + - If no change → component may not be necessary for this + - If unexpected change → revise hypothesis + +3. 
**Control**: Try ablating other components to ensure specificity + +### Step 5: Document Your Findings + +Write a `BehaviorExplanation` with: +- Clear subject prompt +- Description of the behavior +- Components and their roles +- How they work together +- Supporting evidence from ablations/attributions +- Confidence level +- Alternative hypotheses you considered +- Limitations + +## Scientific Principles + +### Be Epistemologically Humble +- Your first hypothesis is probably wrong or incomplete +- Always consider alternative explanations +- A single confirming example doesn't prove a theory +- Look for disconfirming evidence + +### Be Bayesian +- Start with priors from component interpretations +- Update beliefs based on evidence +- Consider the probability of the evidence under different hypotheses +- Don't anchor too strongly on initial observations + +### Triangulate Evidence +- Don't rely on a single type of evidence +- Ablation results + attribution patterns + token stats together are stronger +- Look for convergent evidence from multiple sources + +### Document Uncertainty +- Be explicit about what you're confident in vs. 
uncertain about +- Note when evidence is weak or ambiguous +- Identify what additional tests would strengthen the explanation + +## Output Format + +Write your findings by appending to the output files: + +### events.jsonl +Log progress and observations: +```json +{{"event_type": "observation", "message": "Component h.0.mlp.c_fc:5 has high CI when subject is male", "details": {{"ci_value": 0.85}}, "timestamp": "..."}} +``` + +### explanations.jsonl +When you have a complete explanation: +```json +{{ + "subject_prompt": "The boy ate his lunch", + "behavior_description": "Correctly predicts gendered pronoun 'his' after male subject", + "components_involved": [ + {{"component_key": "h.0.mlp.c_fc:5", "role": "Encodes subject gender as male", "interpretation": "male names/subjects"}}, + {{"component_key": "h.1.attn.o_proj:10", "role": "Transmits gender information to output", "interpretation": null}} + ], + "explanation": "Component h.0.mlp.c_fc:5 activates on male subjects and stores gender information...", + "supporting_evidence": [ + {{"evidence_type": "ablation", "description": "Removing component causes prediction to change from 'his' to 'her'", "details": {{"without_component": {{"his": 0.1, "her": 0.6}}, "with_component": {{"his": 0.8, "her": 0.1}}}}}} + ], + "confidence": "medium", + "alternative_hypotheses": ["Component might encode broader concept of masculine entities, not just humans"], + "limitations": ["Only tested on simple subject-pronoun sentences"] +}} +``` + +## Getting Started + +1. Check the current status: `curl http://localhost:{port}/api/status` +2. Explore available interpretations: `curl http://localhost:{port}/api/correlations/interpretations` +3. Search for interesting prompts or create your own +4. Optimize a sparse circuit for a behavior you find +5. Investigate the components involved +6. Test hypotheses with ablations +7. Document your findings + +Remember: You are exploring! Not every investigation will lead to a clear explanation. 
+Document what you learn, even if it's "this was more complicated than expected." + +Good luck, and happy investigating! +""" + + +def get_agent_prompt(port: int, wandb_path: str, task_id: int, output_dir: str) -> str: + """Generate the full agent prompt with runtime parameters filled in. + + Args: + port: The port the backend is running on. + wandb_path: The WandB path of the loaded run. + task_id: The SLURM task ID for this agent. + output_dir: Path to the agent's output directory. + + Returns: + The complete agent prompt with parameters substituted. + """ + prompt = AGENT_SYSTEM_PROMPT.format(port=port) + + runtime_context = f""" +## Runtime Context + +- **Backend URL**: http://localhost:{port} +- **Loaded Run**: {wandb_path} +- **Task ID**: {task_id} +- **Output Directory**: {output_dir} + +Your output files: +- `{output_dir}/events.jsonl` - Log events and observations here +- `{output_dir}/explanations.jsonl` - Write complete explanations here + +To append to these files, use the Write tool or shell redirection. +""" + return prompt + runtime_context diff --git a/spd/agent_swarm/schemas.py b/spd/agent_swarm/schemas.py new file mode 100644 index 000000000..d554db855 --- /dev/null +++ b/spd/agent_swarm/schemas.py @@ -0,0 +1,120 @@ +"""Schemas for agent swarm outputs. + +All agent outputs are append-only JSONL files. Each line is a JSON object +conforming to one of the schemas defined here. 
+""" + +from datetime import UTC, datetime +from typing import Any, Literal + +from pydantic import BaseModel, Field + + +class ComponentInfo(BaseModel): + """Information about a component involved in a behavior.""" + + component_key: str = Field( + ..., + description="Component key in format 'layer:component_idx' (e.g., 'h.0.mlp.c_fc:5')", + ) + role: str = Field( + ..., + description="The role this component plays in the behavior (e.g., 'stores subject gender')", + ) + interpretation: str | None = Field( + default=None, + description="Auto-interp label for this component if available", + ) + + +class Evidence(BaseModel): + """A piece of supporting evidence for an explanation.""" + + evidence_type: Literal["ablation", "attribution", "activation_pattern", "correlation", "other"] + description: str = Field( + ..., + description="Description of the evidence", + ) + details: dict[str, Any] = Field( + default_factory=dict, + description="Additional structured details (e.g., ablation results, attribution values)", + ) + + +class BehaviorExplanation(BaseModel): + """A candidate explanation for a behavior discovered by an agent. + + This is the primary output schema for agent investigations. Each explanation + describes a behavior (demonstrated by a subject prompt), the components involved, + and supporting evidence. 
+ """ + + subject_prompt: str = Field( + ..., + description="A prompt that demonstrates the behavior being explained", + ) + behavior_description: str = Field( + ..., + description="Clear description of the behavior (e.g., 'correctly predicts gendered pronoun')", + ) + components_involved: list[ComponentInfo] = Field( + ..., + description="List of components involved in this behavior and their roles", + ) + explanation: str = Field( + ..., + description="Explanation of how the components work together to produce the behavior", + ) + supporting_evidence: list[Evidence] = Field( + default_factory=list, + description="Evidence supporting this explanation (ablations, attributions, etc.)", + ) + confidence: Literal["high", "medium", "low"] = Field( + ..., + description="Agent's confidence in this explanation", + ) + alternative_hypotheses: list[str] = Field( + default_factory=list, + description="Alternative hypotheses that were considered but not fully supported", + ) + limitations: list[str] = Field( + default_factory=list, + description="Known limitations of this explanation", + ) + + +class SwarmEvent(BaseModel): + """A generic event logged by an agent during investigation. + + Used for logging progress, observations, and other non-explanation events. + """ + + event_type: Literal[ + "start", + "progress", + "observation", + "hypothesis", + "test_result", + "explanation", + "error", + "complete", + ] + timestamp: datetime = Field(default_factory=lambda: datetime.now(UTC)) + message: str + details: dict[str, Any] = Field(default_factory=dict) + + +class AgentOutput(BaseModel): + """Container for all outputs from a single agent run. + + Written to the agent's output directory as output.json upon completion. 
+ """ + + task_id: int + wandb_path: str + started_at: datetime + completed_at: datetime | None = None + explanations: list[BehaviorExplanation] = Field(default_factory=list) + events: list[SwarmEvent] = Field(default_factory=list) + status: Literal["running", "completed", "failed"] = "running" + error: str | None = None diff --git a/spd/agent_swarm/scripts/__init__.py b/spd/agent_swarm/scripts/__init__.py new file mode 100644 index 000000000..9d0e8ed1b --- /dev/null +++ b/spd/agent_swarm/scripts/__init__.py @@ -0,0 +1 @@ +"""Agent swarm SLURM scripts.""" diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py new file mode 100644 index 000000000..83a32752c --- /dev/null +++ b/spd/agent_swarm/scripts/run_agent.py @@ -0,0 +1,284 @@ +"""Worker script that runs inside each SLURM job. + +This script: +1. Creates an isolated output directory for this agent +2. Starts the app backend with an isolated database +3. Loads the SPD run +4. Launches Claude Code with investigation instructions +5. 
Handles cleanup on exit +""" + +import os +import signal +import socket +import subprocess +import sys +import time +from pathlib import Path +from types import FrameType + +import fire +import requests + +from spd.agent_swarm.agent_prompt import get_agent_prompt +from spd.agent_swarm.schemas import SwarmEvent +from spd.agent_swarm.scripts.run_slurm import get_swarm_output_dir +from spd.log import logger + + +def find_available_port(start_port: int = 8000, max_attempts: int = 100) -> int: + """Find an available port starting from start_port.""" + for offset in range(max_attempts): + port = start_port + offset + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind(("localhost", port)) + return port + except OSError: + continue + raise RuntimeError( + f"Could not find available port in range {start_port}-{start_port + max_attempts}" + ) + + +def wait_for_backend(port: int, timeout: float = 120.0) -> bool: + """Wait for the backend to become healthy.""" + url = f"http://localhost:{port}/api/health" + start = time.time() + while time.time() - start < timeout: + try: + resp = requests.get(url, timeout=5) + if resp.status_code == 200: + return True + except requests.exceptions.ConnectionError: + pass + time.sleep(1) + return False + + +def load_run(port: int, wandb_path: str, context_length: int) -> bool: + """Load the SPD run into the backend.""" + url = f"http://localhost:{port}/api/runs/load" + params = {"wandb_path": wandb_path, "context_length": context_length} + try: + resp = requests.post(url, params=params, timeout=300) + return resp.status_code == 200 + except Exception as e: + logger.error(f"Failed to load run: {e}") + return False + + +def log_event(events_path: Path, event: SwarmEvent) -> None: + """Append an event to the events log.""" + with open(events_path, "a") as f: + f.write(event.model_dump_json() + "\n") + + +def run_agent( + wandb_path: str, + task_id: int, + swarm_id: str, + context_length: int = 128, +) -> None: + """Run a 
single investigation agent. + + Args: + wandb_path: WandB path of the SPD run. + task_id: SLURM task ID (1-indexed). + swarm_id: Unique identifier for this swarm. + context_length: Context length for prompts. + """ + # Setup output directory + swarm_dir = get_swarm_output_dir(swarm_id) + task_dir = swarm_dir / f"task_{task_id}" + task_dir.mkdir(parents=True, exist_ok=True) + + events_path = task_dir / "events.jsonl" + explanations_path = task_dir / "explanations.jsonl" + db_path = task_dir / "app.db" + + # Initialize empty output files + explanations_path.touch() + + log_event( + events_path, + SwarmEvent( + event_type="start", + message=f"Agent {task_id} starting", + details={"wandb_path": wandb_path, "swarm_id": swarm_id}, + ), + ) + + # Find available port (offset by task_id to reduce collisions) + port = find_available_port(start_port=8000 + (task_id - 1) * 10) + logger.info(f"[Task {task_id}] Using port {port}") + + log_event( + events_path, + SwarmEvent( + event_type="progress", + message=f"Starting backend on port {port}", + details={"port": port, "db_path": str(db_path)}, + ), + ) + + # Start backend with isolated database + env = os.environ.copy() + env["SPD_APP_DB_PATH"] = str(db_path) + + backend_cmd = [ + sys.executable, + "-m", + "spd.app.backend.server", + "--port", + str(port), + ] + + backend_proc = subprocess.Popen( + backend_cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + + # Setup cleanup handler + def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: + _ = frame # Unused but required by signal handler signature + logger.info(f"[Task {task_id}] Cleaning up...") + if backend_proc.poll() is None: + backend_proc.terminate() + try: + backend_proc.wait(timeout=5) + except subprocess.TimeoutExpired: + backend_proc.kill() + if signum is not None: + sys.exit(1) + + signal.signal(signal.SIGTERM, cleanup) + signal.signal(signal.SIGINT, cleanup) + + try: + # Wait for backend to be ready + 
logger.info(f"[Task {task_id}] Waiting for backend...") + if not wait_for_backend(port): + log_event( + events_path, + SwarmEvent( + event_type="error", + message="Backend failed to start", + ), + ) + raise RuntimeError("Backend failed to start") + + logger.info(f"[Task {task_id}] Backend ready, loading run...") + log_event( + events_path, + SwarmEvent( + event_type="progress", + message="Backend ready, loading run", + ), + ) + + # Load the SPD run + if not load_run(port, wandb_path, context_length): + log_event( + events_path, + SwarmEvent( + event_type="error", + message="Failed to load run", + details={"wandb_path": wandb_path}, + ), + ) + raise RuntimeError(f"Failed to load run: {wandb_path}") + + logger.info(f"[Task {task_id}] Run loaded, launching Claude Code...") + log_event( + events_path, + SwarmEvent( + event_type="progress", + message="Run loaded, launching Claude Code agent", + ), + ) + + # Generate agent prompt + agent_prompt = get_agent_prompt( + port=port, + wandb_path=wandb_path, + task_id=task_id, + output_dir=str(task_dir), + ) + + # Write prompt to file for reference + prompt_path = task_dir / "agent_prompt.md" + prompt_path.write_text(agent_prompt) + + # Launch Claude Code + # The agent will investigate behaviors and write to the output files + claude_cmd = [ + "claude", + "--print", # Print output to stdout + "--dangerously-skip-permissions", # Allow file writes + ] + + logger.info(f"[Task {task_id}] Starting Claude Code session...") + + claude_proc = subprocess.Popen( + claude_cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + cwd=str(task_dir), + ) + + # Send the investigation prompt + investigation_request = f""" +{agent_prompt} + +--- + +Please begin your investigation. Start by checking the backend status and exploring +available component interpretations. Then find an interesting behavior and investigate it. 
+ +Remember to log your progress to events.jsonl and write complete explanations to +explanations.jsonl when you discover something. +""" + + stdout, _ = claude_proc.communicate(input=investigation_request) + + # Save Claude's output + output_path = task_dir / "claude_output.txt" + output_path.write_text(stdout or "") + + log_event( + events_path, + SwarmEvent( + event_type="complete", + message="Investigation complete", + details={"exit_code": claude_proc.returncode}, + ), + ) + + logger.info(f"[Task {task_id}] Investigation complete") + + except Exception as e: + log_event( + events_path, + SwarmEvent( + event_type="error", + message=f"Agent failed: {e}", + details={"error_type": type(e).__name__}, + ), + ) + logger.error(f"[Task {task_id}] Failed: {e}") + raise + finally: + cleanup() + + +def cli() -> None: + fire.Fire(run_agent) + + +if __name__ == "__main__": + cli() diff --git a/spd/agent_swarm/scripts/run_slurm.py b/spd/agent_swarm/scripts/run_slurm.py new file mode 100644 index 000000000..8b99e253d --- /dev/null +++ b/spd/agent_swarm/scripts/run_slurm.py @@ -0,0 +1,119 @@ +"""SLURM launcher for agent swarm. + +Submits a SLURM array job where each task runs an independent agent investigating +behaviors in an SPD model decomposition. + +Each agent: +1. Starts an isolated app backend (unique port, isolated database) +2. Launches Claude Code with investigation instructions +3. 
Writes findings to append-only JSONL files +""" + +import secrets +from pathlib import Path + +from spd.log import logger +from spd.settings import SPD_OUT_DIR +from spd.utils.git_utils import create_git_snapshot +from spd.utils.slurm import ( + SlurmArrayConfig, + generate_array_script, + submit_slurm_job, +) + + +def get_swarm_output_dir(swarm_id: str) -> Path: + """Get the output directory for a swarm run.""" + return SPD_OUT_DIR / "agent_swarm" / swarm_id + + +def launch_agent_swarm( + wandb_path: str, + n_agents: int, + context_length: int = 128, + partition: str = "h200-reserved", + time: str = "8:00:00", + job_suffix: str | None = None, +) -> None: + """Launch a swarm of agents to investigate behaviors. + + Args: + wandb_path: WandB run path for the SPD decomposition. + n_agents: Number of agents to launch. + context_length: Context length for prompts. + partition: SLURM partition. + time: Time limit per agent. + job_suffix: Optional suffix for job names. + """ + swarm_id = f"swarm-{secrets.token_hex(4)}" + output_dir = get_swarm_output_dir(swarm_id) + output_dir.mkdir(parents=True, exist_ok=True) + + snapshot_branch, commit_hash = create_git_snapshot(swarm_id) + logger.info(f"Created git snapshot: {snapshot_branch} ({commit_hash[:8]})") + + suffix = f"-{job_suffix}" if job_suffix else "" + job_name = f"spd-swarm{suffix}" + + # Write swarm metadata + metadata_path = output_dir / "metadata.json" + import json + + metadata = { + "swarm_id": swarm_id, + "wandb_path": wandb_path, + "n_agents": n_agents, + "context_length": context_length, + "snapshot_branch": snapshot_branch, + "commit_hash": commit_hash, + } + metadata_path.write_text(json.dumps(metadata, indent=2)) + + # Build worker commands (SLURM arrays are 1-indexed) + worker_commands = [] + for task_id in range(1, n_agents + 1): + cmd = ( + f"python -m spd.agent_swarm.scripts.run_agent " + f'"{wandb_path}" ' + f"--task_id {task_id} " + f"--swarm_id {swarm_id} " + f"--context_length {context_length}" + ) + 
worker_commands.append(cmd) + + array_config = SlurmArrayConfig( + job_name=job_name, + partition=partition, + n_gpus=1, + time=time, + snapshot_branch=snapshot_branch, + max_concurrent_tasks=min(n_agents, 8), # Respect cluster limits + ) + array_script = generate_array_script(array_config, worker_commands) + array_result = submit_slurm_job( + array_script, + "agent_swarm", + is_array=True, + n_array_tasks=n_agents, + ) + + logger.section("Agent swarm jobs submitted!") + logger.values( + { + "Swarm ID": swarm_id, + "WandB path": wandb_path, + "N agents": n_agents, + "Context length": context_length, + "Output directory": str(output_dir), + "Snapshot": f"{snapshot_branch} ({commit_hash[:8]})", + "Job ID": array_result.job_id, + "Logs": array_result.log_pattern, + "Script": str(array_result.script_path), + } + ) + logger.info("") + logger.info("Monitor progress:") + logger.info(f" tail -f {output_dir}/task_*/events.jsonl") + logger.info("") + logger.info("View explanations:") + logger.info(f" cat {output_dir}/task_*/explanations.jsonl | jq .") diff --git a/spd/agent_swarm/scripts/run_slurm_cli.py b/spd/agent_swarm/scripts/run_slurm_cli.py new file mode 100644 index 000000000..20a6d8457 --- /dev/null +++ b/spd/agent_swarm/scripts/run_slurm_cli.py @@ -0,0 +1,62 @@ +"""CLI entry point for agent swarm SLURM launcher. + +Thin wrapper for fast --help. Heavy imports deferred to run_slurm.py. 
+ +Usage: + spd-swarm <wandb_path> --n_agents 10 + spd-swarm <wandb_path> --n_agents 5 --context_length 128 + +Examples: + # Launch 10 agents to investigate a decomposition + spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 10 + + # Launch 5 agents with custom context length and time limit + spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 5 --context_length 64 --time 4:00:00 +""" + +import fire + +from spd.settings import DEFAULT_PARTITION_NAME + + +def main( + wandb_path: str, + n_agents: int, + context_length: int = 128, + partition: str = DEFAULT_PARTITION_NAME, + time: str = "8:00:00", + job_suffix: str | None = None, +) -> None: + """Launch a swarm of agents to investigate behaviors in an SPD model. + + Each agent runs in its own SLURM job with an isolated app backend instance. + Agents use Claude Code to investigate behaviors and write findings to + append-only JSONL files. + + Args: + wandb_path: WandB run path for the SPD decomposition to investigate. + Format: "entity/project/runs/run_id" or "wandb:entity/project/run_id" + n_agents: Number of agents to launch (each gets 1 GPU). + context_length: Context length for prompts (default 128). + partition: SLURM partition name. + time: Job time limit per agent (default 8 hours). + job_suffix: Optional suffix for SLURM job names. 
+ """ + from spd.agent_swarm.scripts.run_slurm import launch_agent_swarm + + launch_agent_swarm( + wandb_path=wandb_path, + n_agents=n_agents, + context_length=context_length, + partition=partition, + time=time, + job_suffix=job_suffix, + ) + + +def cli() -> None: + fire.Fire(main) + + +if __name__ == "__main__": + cli() diff --git a/spd/app/backend/database.py b/spd/app/backend/database.py index e5ee4db59..1ee06ce5a 100644 --- a/spd/app/backend/database.py +++ b/spd/app/backend/database.py @@ -8,6 +8,7 @@ import hashlib import json +import os import sqlite3 from dataclasses import asdict from pathlib import Path @@ -23,8 +24,21 @@ GraphType = Literal["standard", "optimized", "manual"] # Persistent data directories +# Can be overridden via SPD_APP_DB_PATH environment variable for isolation _APP_DATA_DIR = REPO_ROOT / ".data" / "app" -DEFAULT_DB_PATH = _APP_DATA_DIR / "prompt_attr.db" +_DEFAULT_DB_PATH = _APP_DATA_DIR / "prompt_attr.db" + + +def get_default_db_path() -> Path: + """Get the default database path, respecting SPD_APP_DB_PATH env var.""" + env_path = os.environ.get("SPD_APP_DB_PATH") + if env_path: + return Path(env_path) + return _DEFAULT_DB_PATH + + +# For backwards compatibility +DEFAULT_DB_PATH = _DEFAULT_DB_PATH class Run(BaseModel): @@ -107,7 +121,7 @@ class PromptAttrDB: """ def __init__(self, db_path: Path | None = None, check_same_thread: bool = True): - self.db_path = db_path or DEFAULT_DB_PATH + self.db_path = db_path or get_default_db_path() self._check_same_thread = check_same_thread self._conn: sqlite3.Connection | None = None From 498d459e89f1360464dbdfce4a8681e9ab75f093 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 30 Jan 2026 21:04:25 +0000 Subject: [PATCH 02/62] Stream Claude Code output to file in real-time Previously used communicate() which buffers all output until process completes. 
Now streams directly to claude_output.txt so you can monitor agent activity with: tail -f <task_dir>/claude_output.txt https://claude.ai/code/session_01UMpYFZ3A98vsPkqoq6zvT6 --- spd/agent_swarm/scripts/run_agent.py | 39 +++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py index 83a32752c..072d8a8c6 100644 --- a/spd/agent_swarm/scripts/run_agent.py +++ b/spd/agent_swarm/scripts/run_agent.py @@ -212,8 +212,8 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: prompt_path = task_dir / "agent_prompt.md" prompt_path.write_text(agent_prompt) - # Launch Claude Code - # The agent will investigate behaviors and write to the output files + # Launch Claude Code with output streaming to file + claude_output_path = task_dir / "claude_output.txt" claude_cmd = [ "claude", "--print", # Print output to stdout @@ -221,18 +221,21 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: ] logger.info(f"[Task {task_id}] Starting Claude Code session...") + logger.info(f"[Task {task_id}] Monitor with: tail -f {claude_output_path}") + + # Open output file for streaming writes + with open(claude_output_path, "w") as output_file: + claude_proc = subprocess.Popen( + claude_cmd, + stdin=subprocess.PIPE, + stdout=output_file, + stderr=subprocess.STDOUT, + text=True, + cwd=str(task_dir), + ) - claude_proc = subprocess.Popen( - claude_cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - cwd=str(task_dir), - ) - - # Send the investigation prompt - investigation_request = f""" + # Send the investigation prompt and close stdin + investigation_request = f""" {agent_prompt} --- @@ -243,12 +246,12 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: Remember to log your progress to events.jsonl and write complete explanations to explanations.jsonl when you discover 
something. """ + assert claude_proc.stdin is not None + claude_proc.stdin.write(investigation_request) + claude_proc.stdin.close() - stdout, _ = claude_proc.communicate(input=investigation_request) - - # Save Claude's output - output_path = task_dir / "claude_output.txt" - output_path.write_text(stdout or "") + # Wait for Claude to finish (output streams to file in real-time) + claude_proc.wait() log_event( events_path, From efe5928ebafca30d30d9dcc1b9e40a1412b1fa19 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 30 Jan 2026 21:31:05 +0000 Subject: [PATCH 03/62] Use stream-json output format and add max_turns limit - Switch to --output-format stream-json for structured JSONL output - Add --max-turns parameter (default 50) to prevent runaway agents - Output file changed from claude_output.txt to claude_output.jsonl - Updated monitoring commands in logs to use jq for parsing Monitor with: tail -f task_*/claude_output.jsonl | jq -r '.result // empty' https://claude.ai/code/session_01UMpYFZ3A98vsPkqoq6zvT6 --- spd/agent_swarm/scripts/run_agent.py | 15 ++++++++++----- spd/agent_swarm/scripts/run_slurm.py | 10 +++++++++- spd/agent_swarm/scripts/run_slurm_cli.py | 9 ++++++--- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py index 072d8a8c6..f048335e3 100644 --- a/spd/agent_swarm/scripts/run_agent.py +++ b/spd/agent_swarm/scripts/run_agent.py @@ -79,6 +79,7 @@ def run_agent( task_id: int, swarm_id: str, context_length: int = 128, + max_turns: int = 50, ) -> None: """Run a single investigation agent. @@ -87,6 +88,7 @@ def run_agent( task_id: SLURM task ID (1-indexed). swarm_id: Unique identifier for this swarm. context_length: Context length for prompts. + max_turns: Maximum agentic turns before stopping (prevents runaway agents). 
""" # Setup output directory swarm_dir = get_swarm_output_dir(swarm_id) @@ -212,16 +214,19 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: prompt_path = task_dir / "agent_prompt.md" prompt_path.write_text(agent_prompt) - # Launch Claude Code with output streaming to file - claude_output_path = task_dir / "claude_output.txt" + # Launch Claude Code with streaming JSON output + claude_output_path = task_dir / "claude_output.jsonl" claude_cmd = [ "claude", - "--print", # Print output to stdout - "--dangerously-skip-permissions", # Allow file writes + "--print", + "--output-format", "stream-json", # Structured JSONL for parsing + "--max-turns", str(max_turns), # Prevent runaway agents + "--dangerously-skip-permissions", ] - logger.info(f"[Task {task_id}] Starting Claude Code session...") + logger.info(f"[Task {task_id}] Starting Claude Code (max_turns={max_turns})...") logger.info(f"[Task {task_id}] Monitor with: tail -f {claude_output_path}") + logger.info(f"[Task {task_id}] Parse with: tail -f {claude_output_path} | jq -r '.result // empty'") # Open output file for streaming writes with open(claude_output_path, "w") as output_file: diff --git a/spd/agent_swarm/scripts/run_slurm.py b/spd/agent_swarm/scripts/run_slurm.py index 8b99e253d..f596e1ed9 100644 --- a/spd/agent_swarm/scripts/run_slurm.py +++ b/spd/agent_swarm/scripts/run_slurm.py @@ -31,6 +31,7 @@ def launch_agent_swarm( wandb_path: str, n_agents: int, context_length: int = 128, + max_turns: int = 50, partition: str = "h200-reserved", time: str = "8:00:00", job_suffix: str | None = None, @@ -41,6 +42,7 @@ def launch_agent_swarm( wandb_path: WandB run path for the SPD decomposition. n_agents: Number of agents to launch. context_length: Context length for prompts. + max_turns: Maximum agentic turns per agent (prevents runaway). partition: SLURM partition. time: Time limit per agent. job_suffix: Optional suffix for job names. 
@@ -64,6 +66,7 @@ def launch_agent_swarm( "wandb_path": wandb_path, "n_agents": n_agents, "context_length": context_length, + "max_turns": max_turns, "snapshot_branch": snapshot_branch, "commit_hash": commit_hash, } @@ -77,7 +80,8 @@ def launch_agent_swarm( f'"{wandb_path}" ' f"--task_id {task_id} " f"--swarm_id {swarm_id} " - f"--context_length {context_length}" + f"--context_length {context_length} " + f"--max_turns {max_turns}" ) worker_commands.append(cmd) @@ -104,6 +108,7 @@ def launch_agent_swarm( "WandB path": wandb_path, "N agents": n_agents, "Context length": context_length, + "Max turns": max_turns, "Output directory": str(output_dir), "Snapshot": f"{snapshot_branch} ({commit_hash[:8]})", "Job ID": array_result.job_id, @@ -115,5 +120,8 @@ def launch_agent_swarm( logger.info("Monitor progress:") logger.info(f" tail -f {output_dir}/task_*/events.jsonl") logger.info("") + logger.info("Monitor Claude output (stream-json):") + logger.info(f" tail -f {output_dir}/task_*/claude_output.jsonl | jq -r '.result // empty'") + logger.info("") logger.info("View explanations:") logger.info(f" cat {output_dir}/task_*/explanations.jsonl | jq .") diff --git a/spd/agent_swarm/scripts/run_slurm_cli.py b/spd/agent_swarm/scripts/run_slurm_cli.py index 20a6d8457..9b75ce95f 100644 --- a/spd/agent_swarm/scripts/run_slurm_cli.py +++ b/spd/agent_swarm/scripts/run_slurm_cli.py @@ -4,14 +4,14 @@ Usage: spd-swarm <wandb_path> --n_agents 10 - spd-swarm <wandb_path> --n_agents 5 --context_length 128 + spd-swarm <wandb_path> --n_agents 5 --max_turns 30 Examples: # Launch 10 agents to investigate a decomposition spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 10 - # Launch 5 agents with custom context length and time limit - spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 5 --context_length 64 --time 4:00:00 + # Launch 5 agents with custom settings + spd-swarm goodfire-ai/spd/runs/abc123 --n_agents 5 --max_turns 30 --time 4:00:00 """ import fire from spd.settings import DEFAULT_PARTITION_NAME def main( wandb_path: str, n_agents: int, context_length: int 
= 128, + max_turns: int = 50, partition: str = DEFAULT_PARTITION_NAME, time: str = "8:00:00", job_suffix: str | None = None, @@ -38,6 +39,7 @@ def main( Format: "entity/project/runs/run_id" or "wandb:entity/project/run_id" n_agents: Number of agents to launch (each gets 1 GPU). context_length: Context length for prompts (default 128). + max_turns: Maximum agentic turns per agent (default 50, prevents runaway). partition: SLURM partition name. time: Job time limit per agent (default 8 hours). job_suffix: Optional suffix for SLURM job names. @@ -48,6 +50,7 @@ def main( wandb_path=wandb_path, n_agents=n_agents, context_length=context_length, + max_turns=max_turns, partition=partition, time=time, job_suffix=job_suffix, From ef5b0fd80ee91eb9ed5c392ef00581693de964c8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 30 Jan 2026 22:08:49 +0000 Subject: [PATCH 04/62] Fix stream-json output requiring --verbose flag Claude Code requires --verbose when using --output-format=stream-json with --print mode. 
https://claude.ai/code/session_01UMpYFZ3A98vsPkqoq6zvT6 --- spd/agent_swarm/scripts/run_agent.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py index f048335e3..b37f4e451 100644 --- a/spd/agent_swarm/scripts/run_agent.py +++ b/spd/agent_swarm/scripts/run_agent.py @@ -219,8 +219,9 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: claude_cmd = [ "claude", "--print", - "--output-format", "stream-json", # Structured JSONL for parsing - "--max-turns", str(max_turns), # Prevent runaway agents + "--verbose", # Required for stream-json output + "--output-format", "stream-json", + "--max-turns", str(max_turns), "--dangerously-skip-permissions", ] From f40f02e443bdd7099fd11d1bdf56915e797ea09f Mon Sep 17 00:00:00 2001 From: Claude SPD1 Date: Fri, 30 Jan 2026 23:24:50 +0000 Subject: [PATCH 05/62] Add GPU lock to prevent concurrent GPU operations When multiple GPU-intensive requests are made concurrently (graph computation, optimization, intervention), the backend would hang. This adds a lock that returns HTTP 503 immediately if a GPU operation is already in progress, allowing clients to retry later. 
Co-Authored-By: Claude Opus 4.5 --- spd/agent_swarm/scripts/run_agent.py | 10 ++- spd/app/backend/routers/graphs.py | 61 +++++++++------ spd/app/backend/routers/intervention.py | 100 +++++++++++++----------- spd/app/backend/state.py | 23 ++++++ 4 files changed, 121 insertions(+), 73 deletions(-) diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py index b37f4e451..c41c8f30c 100644 --- a/spd/agent_swarm/scripts/run_agent.py +++ b/spd/agent_swarm/scripts/run_agent.py @@ -220,14 +220,18 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: "claude", "--print", "--verbose", # Required for stream-json output - "--output-format", "stream-json", - "--max-turns", str(max_turns), + "--output-format", + "stream-json", + "--max-turns", + str(max_turns), "--dangerously-skip-permissions", ] logger.info(f"[Task {task_id}] Starting Claude Code (max_turns={max_turns})...") logger.info(f"[Task {task_id}] Monitor with: tail -f {claude_output_path}") - logger.info(f"[Task {task_id}] Parse with: tail -f {claude_output_path} | jq -r '.result // empty'") + logger.info( + f"[Task {task_id}] Parse with: tail -f {claude_output_path} | jq -r '.result // empty'" + ) # Open output file for streaming writes with open(claude_output_path, "w") as output_file: diff --git a/spd/app/backend/routers/graphs.py b/spd/app/backend/routers/graphs.py index c0f478744..1ea02e5c6 100644 --- a/spd/app/backend/routers/graphs.py +++ b/spd/app/backend/routers/graphs.py @@ -240,8 +240,20 @@ def build_out_probs( def stream_computation( work: Callable[[ProgressCallback], GraphData | GraphDataWithOptimization], + gpu_lock: threading.Lock, ) -> StreamingResponse: - """Run graph computation in a thread with SSE streaming for progress updates.""" + """Run graph computation in a thread with SSE streaming for progress updates. + + Acquires gpu_lock before starting and holds it until computation completes. 
+ Raises 503 if the lock is already held by another operation. + """ + # Try to acquire lock non-blocking - fail fast if GPU is busy + if not gpu_lock.acquire(blocking=False): + raise HTTPException( + status_code=503, + detail="GPU operation already in progress. Please wait and retry.", + ) + progress_queue: queue.Queue[dict[str, Any]] = queue.Queue() def on_progress(current: int, total: int, stage: str) -> None: @@ -256,28 +268,31 @@ def compute_thread() -> None: progress_queue.put({"type": "error", "error": str(e)}) def generate() -> Generator[str]: - thread = threading.Thread(target=compute_thread) - thread.start() - - while True: - try: - msg = progress_queue.get(timeout=0.1) - except queue.Empty: - if not thread.is_alive(): + try: + thread = threading.Thread(target=compute_thread) + thread.start() + + while True: + try: + msg = progress_queue.get(timeout=0.1) + except queue.Empty: + if not thread.is_alive(): + break + continue + + if msg["type"] == "progress": + yield f"data: {json.dumps(msg)}\n\n" + elif msg["type"] == "error": + yield f"data: {json.dumps(msg)}\n\n" + break + elif msg["type"] == "result": + complete_data = {"type": "complete", "data": msg["result"].model_dump()} + yield f"data: {json.dumps(complete_data)}\n\n" break - continue - - if msg["type"] == "progress": - yield f"data: {json.dumps(msg)}\n\n" - elif msg["type"] == "error": - yield f"data: {json.dumps(msg)}\n\n" - break - elif msg["type"] == "result": - complete_data = {"type": "complete", "data": msg["result"].model_dump()} - yield f"data: {json.dumps(complete_data)}\n\n" - break - thread.join() + thread.join() + finally: + gpu_lock.release() return StreamingResponse(generate(), media_type="text/event-stream") @@ -456,7 +471,7 @@ def work(on_progress: ProgressCallback) -> GraphData: l0_total=len(filtered_node_ci_vals), ) - return stream_computation(work) + return stream_computation(work, manager._gpu_lock) def _edge_to_edge_data(edge: Edge) -> EdgeData: @@ -660,7 +675,7 @@ def 
work(on_progress: ProgressCallback) -> GraphDataWithOptimization: ), ) - return stream_computation(work) + return stream_computation(work, manager._gpu_lock) def _add_pseudo_layer_nodes( diff --git a/spd/app/backend/routers/intervention.py b/spd/app/backend/routers/intervention.py index 4c46e136c..a8fcb3fbc 100644 --- a/spd/app/backend/routers/intervention.py +++ b/spd/app/backend/routers/intervention.py @@ -148,45 +148,48 @@ def _run_intervention_forward( @router.post("") @log_errors -def run_intervention(request: InterventionRequest, loaded: DepLoadedRun) -> InterventionResponse: +def run_intervention( + request: InterventionRequest, loaded: DepLoadedRun, manager: DepStateManager +) -> InterventionResponse: """Run intervention forward pass with specified nodes active (legacy endpoint).""" - token_ids = loaded.tokenizer.encode(request.text, add_special_tokens=False) - tokens = torch.tensor([token_ids], dtype=torch.long, device=DEVICE) - - active_nodes = [(n.layer, n.seq_pos, n.component_idx) for n in request.nodes] - - seq_len = tokens.shape[1] - for _, seq_pos, _ in active_nodes: - if seq_pos >= seq_len: - raise ValueError(f"seq_pos {seq_pos} out of bounds for text with {seq_len} tokens") - - result = compute_intervention_forward( - model=loaded.model, - tokens=tokens, - active_nodes=active_nodes, - top_k=request.top_k, - tokenizer=loaded.tokenizer, - ) + with manager.gpu_lock(): + token_ids = loaded.tokenizer.encode(request.text, add_special_tokens=False) + tokens = torch.tensor([token_ids], dtype=torch.long, device=DEVICE) + + active_nodes = [(n.layer, n.seq_pos, n.component_idx) for n in request.nodes] + + seq_len = tokens.shape[1] + for _, seq_pos, _ in active_nodes: + if seq_pos >= seq_len: + raise ValueError(f"seq_pos {seq_pos} out of bounds for text with {seq_len} tokens") + + result = compute_intervention_forward( + model=loaded.model, + tokens=tokens, + active_nodes=active_nodes, + top_k=request.top_k, + tokenizer=loaded.tokenizer, + ) - 
predictions_per_position = [ - [ - TokenPrediction( - token=token, - token_id=token_id, - spd_prob=spd_prob, - target_prob=target_prob, - logit=logit, - target_logit=target_logit, - ) - for token, token_id, spd_prob, logit, target_prob, target_logit in pos_predictions + predictions_per_position = [ + [ + TokenPrediction( + token=token, + token_id=token_id, + spd_prob=spd_prob, + target_prob=target_prob, + logit=logit, + target_logit=target_logit, + ) + for token, token_id, spd_prob, logit, target_prob, target_logit in pos_predictions + ] + for pos_predictions in result.predictions_per_position ] - for pos_predictions in result.predictions_per_position - ] - return InterventionResponse( - input_tokens=result.input_tokens, - predictions_per_position=predictions_per_position, - ) + return InterventionResponse( + input_tokens=result.input_tokens, + predictions_per_position=predictions_per_position, + ) @router.post("/run") @@ -195,14 +198,16 @@ def run_and_save_intervention( request: RunInterventionRequest, loaded: DepLoadedRun, db: DepDB, + manager: DepStateManager, ) -> InterventionRunSummary: """Run an intervention and save the result.""" - response = _run_intervention_forward( - text=request.text, - selected_nodes=request.selected_nodes, - top_k=request.top_k, - loaded=loaded, - ) + with manager.gpu_lock(): + response = _run_intervention_forward( + text=request.text, + selected_nodes=request.selected_nodes, + top_k=request.top_k, + loaded=loaded, + ) run_id = db.save_intervention_run( graph_id=request.graph_id, @@ -310,12 +315,13 @@ def fork_intervention_run( modified_text = loaded.tokenizer.decode(modified_token_ids) # Run the intervention forward pass with modified tokens but same selected nodes - response = _run_intervention_forward( - text=modified_text, - selected_nodes=parent_run.selected_nodes, - top_k=request.top_k, - loaded=loaded, - ) + with manager.gpu_lock(): + response = _run_intervention_forward( + text=modified_text, + 
selected_nodes=parent_run.selected_nodes, + top_k=request.top_k, + loaded=loaded, + ) # Save the forked run fork_id = db.save_forked_intervention_run( diff --git a/spd/app/backend/state.py b/spd/app/backend/state.py index 47dacfe51..7364ff1d1 100644 --- a/spd/app/backend/state.py +++ b/spd/app/backend/state.py @@ -5,9 +5,13 @@ - StateManager: Singleton managing app-wide state with proper lifecycle """ +import threading +from collections.abc import Generator +from contextlib import contextmanager from dataclasses import dataclass, field from typing import Any +from fastapi import HTTPException from transformers.tokenization_utils_base import PreTrainedTokenizerBase from spd.app.backend.database import PromptAttrDB, Run @@ -147,6 +151,7 @@ class StateManager: def __init__(self) -> None: self._state: AppState | None = None + self._gpu_lock = threading.Lock() @classmethod def get(cls) -> "StateManager": @@ -189,3 +194,21 @@ def close(self) -> None: """Clean up resources.""" if self._state is not None: self._state.db.close() + + @contextmanager + def gpu_lock(self) -> Generator[None]: + """Acquire GPU lock or fail with 503 if another GPU operation is in progress. + + Use this for GPU-intensive endpoints to prevent concurrent operations + that would cause the server to hang. + """ + acquired = self._gpu_lock.acquire(blocking=False) + if not acquired: + raise HTTPException( + status_code=503, + detail="GPU operation already in progress. Please wait and retry.", + ) + try: + yield + finally: + self._gpu_lock.release() From 567fb198c938513c2a348c61dacc97f9435e45b8 Mon Sep 17 00:00:00 2001 From: Claude SPD1 Date: Fri, 30 Jan 2026 23:38:11 +0000 Subject: [PATCH 06/62] Add research_log.md for human-readable agent progress Agents now create and update a research_log.md file with readable progress updates. This makes it easy to follow what the agent is doing and discovering without parsing JSONL files. 
Co-Authored-By: Claude Opus 4.5 --- spd/agent_swarm/CLAUDE.md | 11 ++++- spd/agent_swarm/agent_prompt.py | 70 +++++++++++++++++++++++----- spd/agent_swarm/scripts/run_agent.py | 14 ++++-- 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/spd/agent_swarm/CLAUDE.md b/spd/agent_swarm/CLAUDE.md index ee2e89be2..ee4a57db4 100644 --- a/spd/agent_swarm/CLAUDE.md +++ b/spd/agent_swarm/CLAUDE.md @@ -42,11 +42,12 @@ spd/agent_swarm/ SPD_OUT_DIR/agent_swarm/<swarm_id>/ ├── metadata.json # Swarm configuration ├── task_1/ -│ ├── events.jsonl # Progress and observations +│ ├── research_log.md # Human-readable progress log (PRIMARY OUTPUT) +│ ├── events.jsonl # Structured progress and observations │ ├── explanations.jsonl # Complete behavior explanations │ ├── app.db # Isolated SQLite database │ ├── agent_prompt.md # The prompt given to the agent -│ └── claude_output.txt # Raw Claude Code output +│ └── claude_output.jsonl # Raw Claude Code output (stream-json format) ├── task_2/ │ └── ... └── task_N/ @@ -90,6 +91,12 @@ This prevents conflicts when multiple agents run on the same machine. ## Monitoring ```bash +# Watch research logs (best way to follow agent progress) +tail -f SPD_OUT_DIR/agent_swarm/<swarm_id>/task_*/research_log.md + +# Watch a specific agent's research log +cat SPD_OUT_DIR/agent_swarm/<swarm_id>/task_1/research_log.md + +# Watch events from all agents tail -f SPD_OUT_DIR/agent_swarm/<swarm_id>/task_*/events.jsonl diff --git a/spd/agent_swarm/agent_prompt.py b/spd/agent_swarm/agent_prompt.py index e3cf4fd21..8ee02d6dd 100644 --- a/spd/agent_swarm/agent_prompt.py +++ b/spd/agent_swarm/agent_prompt.py @@ -254,10 +254,50 @@ ## Output Format -Write your findings by appending to the output files: +Write your findings to the output files. **The research log is your primary output for humans to read.** + +### research_log.md (MOST IMPORTANT - Write here frequently!) +This is a human-readable log of your investigation. Write here often so someone can follow your progress. 
+Use clear markdown formatting: + +```markdown +## [HH:MM] Starting Investigation + +Looking at component interpretations to find interesting patterns... + +## [HH:MM] Hypothesis: Gendered Pronoun Circuit + +Found components that seem related to pronouns: +- h.0.mlp.c_fc:42 - "he/his pronouns after male subjects" +- h.0.mlp.c_fc:89 - "she/her pronouns after female subjects" + +Testing with prompt: "The boy said that he" + +## [HH:MM] Optimization Results + +Ran optimization for "he" prediction at position 4: +- Found 15 active components +- Key components: h.0.mlp.c_fc:42 (CI=0.92), h.1.attn.o_proj:156 (CI=0.78) + +## [HH:MM] Ablation Test + +Ablating h.0.mlp.c_fc:42: +- Before: P(he)=0.82, P(she)=0.11 +- After: P(he)=0.23, P(she)=0.45 + +This confirms the component is important for masculine pronoun prediction! + +## [HH:MM] Conclusion + +Found a circuit for gendered pronoun prediction. Components h.0.mlp.c_fc:42 and +h.1.attn.o_proj:156 work together to predict masculine pronouns after male subjects. +``` + +**IMPORTANT**: Update the research log every few minutes with your current progress, +findings, and next steps. This is how humans monitor your work! ### events.jsonl -Log progress and observations: +Log structured progress and observations: ```json {{"event_type": "observation", "message": "Component h.0.mlp.c_fc:5 has high CI when subject is male", "details": {{"ci_value": 0.85}}, "timestamp": "..."}} ``` @@ -284,15 +324,20 @@ ## Getting Started -1. Check the current status: `curl http://localhost:{port}/api/status` -2. Explore available interpretations: `curl http://localhost:{port}/api/correlations/interpretations` -3. Search for interesting prompts or create your own -4. Optimize a sparse circuit for a behavior you find -5. Investigate the components involved -6. Test hypotheses with ablations -7. Document your findings +1. **Create your research log**: Start by creating `research_log.md` with a header +2. 
Check the current status: `curl http://localhost:{port}/api/status` +3. Explore available interpretations: `curl http://localhost:{port}/api/correlations/interpretations` +4. Search for interesting prompts or create your own +5. **Update research_log.md** with what you're investigating +6. Optimize a sparse circuit for a behavior you find +7. Investigate the components involved +8. Test hypotheses with ablations +9. **Update research_log.md** with findings +10. Document complete explanations in `explanations.jsonl` + +**Remember to update research_log.md frequently** - this is how humans follow your progress! -Remember: You are exploring! Not every investigation will lead to a clear explanation. +You are exploring! Not every investigation will lead to a clear explanation. Document what you learn, even if it's "this was more complicated than expected." Good luck, and happy investigating! @@ -322,9 +367,10 @@ def get_agent_prompt(port: int, wandb_path: str, task_id: int, output_dir: str) - **Output Directory**: {output_dir} Your output files: -- `{output_dir}/events.jsonl` - Log events and observations here +- `{output_dir}/research_log.md` - **PRIMARY OUTPUT** - Write readable progress updates here frequently! +- `{output_dir}/events.jsonl` - Log structured events and observations here - `{output_dir}/explanations.jsonl` - Write complete explanations here -To append to these files, use the Write tool or shell redirection. +**Start by creating research_log.md with a header, then update it every few minutes!** """ return prompt + runtime_context diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py index c41c8f30c..3c5a78449 100644 --- a/spd/agent_swarm/scripts/run_agent.py +++ b/spd/agent_swarm/scripts/run_agent.py @@ -250,11 +250,17 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: --- -Please begin your investigation. 
Start by checking the backend status and exploring -available component interpretations. Then find an interesting behavior and investigate it. +Please begin your investigation: -Remember to log your progress to events.jsonl and write complete explanations to -explanations.jsonl when you discover something. +1. **FIRST**: Create `{task_dir}/research_log.md` with a header like "# Research Log - Task {task_id}" +2. Check the backend status and explore component interpretations +3. Find an interesting behavior to investigate +4. **Update research_log.md frequently** with your progress, findings, and next steps + +Remember: +- research_log.md is your PRIMARY output - humans will read this to follow your work +- Update it every few minutes with what you're doing and discovering +- Write complete explanations to explanations.jsonl when you finish investigating a behavior """ assert claude_proc.stdin is not None claude_proc.stdin.write(investigation_request) From 4c4a843b2bb6a9b6e7fb67508e6a6c63960494fc Mon Sep 17 00:00:00 2001 From: Claude SPD1 Date: Fri, 30 Jan 2026 23:56:57 +0000 Subject: [PATCH 07/62] Add full timestamps to research log examples Show YYYY-MM-DD HH:MM:SS format and provide tip for getting timestamps. Co-Authored-By: Claude Opus 4.5 --- spd/agent_swarm/agent_prompt.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/spd/agent_swarm/agent_prompt.py b/spd/agent_swarm/agent_prompt.py index 8ee02d6dd..b469b47b5 100644 --- a/spd/agent_swarm/agent_prompt.py +++ b/spd/agent_swarm/agent_prompt.py @@ -261,11 +261,11 @@ Use clear markdown formatting: ```markdown -## [HH:MM] Starting Investigation +## [2026-01-30 14:23:15] Starting Investigation Looking at component interpretations to find interesting patterns... 
-## [HH:MM] Hypothesis: Gendered Pronoun Circuit +## [2026-01-30 14:25:42] Hypothesis: Gendered Pronoun Circuit Found components that seem related to pronouns: - h.0.mlp.c_fc:42 - "he/his pronouns after male subjects" @@ -273,13 +273,13 @@ Testing with prompt: "The boy said that he" -## [HH:MM] Optimization Results +## [2026-01-30 14:28:03] Optimization Results Ran optimization for "he" prediction at position 4: - Found 15 active components - Key components: h.0.mlp.c_fc:42 (CI=0.92), h.1.attn.o_proj:156 (CI=0.78) -## [HH:MM] Ablation Test +## [2026-01-30 14:31:17] Ablation Test Ablating h.0.mlp.c_fc:42: - Before: P(he)=0.82, P(she)=0.11 @@ -287,12 +287,14 @@ This confirms the component is important for masculine pronoun prediction! -## [HH:MM] Conclusion +## [2026-01-30 14:35:44] Conclusion Found a circuit for gendered pronoun prediction. Components h.0.mlp.c_fc:42 and h.1.attn.o_proj:156 work together to predict masculine pronouns after male subjects. ``` +**TIP**: Get the current timestamp with `date '+%Y-%m-%d %H:%M:%S'` for your log entries. + **IMPORTANT**: Update the research log every few minutes with your current progress, findings, and next steps. This is how humans monitor your work! 
From cb6e6f063808af3f46f976f41531dd9ff92b4351 Mon Sep 17 00:00:00 2001 From: Claude SPD1 Date: Sat, 31 Jan 2026 19:08:07 +0000 Subject: [PATCH 08/62] wip: Integrate agent swarm with MCP for Claude Code tool access --- spd/agent_swarm/CLAUDE.md | 69 +- spd/agent_swarm/agent_prompt.py | 369 ++---- spd/agent_swarm/scripts/run_agent.py | 52 +- spd/app/CLAUDE.md | 3 +- spd/app/backend/routers/__init__.py | 4 + spd/app/backend/routers/investigations.py | 262 ++++ spd/app/backend/routers/mcp.py | 1171 +++++++++++++++++ spd/app/backend/server.py | 29 + .../src/components/InvestigationsTab.svelte | 497 +++++++ .../frontend/src/components/RunView.svelte | 15 +- spd/app/frontend/src/lib/api/index.ts | 1 + .../frontend/src/lib/api/investigations.ts | 55 + 12 files changed, 2211 insertions(+), 316 deletions(-) create mode 100644 spd/app/backend/routers/investigations.py create mode 100644 spd/app/backend/routers/mcp.py create mode 100644 spd/app/frontend/src/components/InvestigationsTab.svelte create mode 100644 spd/app/frontend/src/lib/api/investigations.ts diff --git a/spd/agent_swarm/CLAUDE.md b/spd/agent_swarm/CLAUDE.md index ee4a57db4..48fabc504 100644 --- a/spd/agent_swarm/CLAUDE.md +++ b/spd/agent_swarm/CLAUDE.md @@ -7,9 +7,10 @@ that investigate behaviors in SPD model decompositions. The agent swarm system allows you to: 1. Launch many parallel agents (each as a SLURM job with 1 GPU) -2. Each agent runs an isolated app backend instance -3. Agents investigate behaviors using the SPD app API -4. Findings are written to append-only JSONL files +2. Each agent runs an isolated app backend instance with MCP support +3. Agents investigate behaviors using SPD tools via MCP (Model Context Protocol) +4. Progress is streamed in real-time via MCP SSE events +5. 
Findings are written to append-only JSONL files ## Usage @@ -36,22 +37,55 @@ spd/agent_swarm/ └── run_agent.py # Worker script (runs in each SLURM job) ``` +## MCP Tools + +Agents access ALL SPD functionality via MCP (Model Context Protocol). The backend exposes +these tools at `/mcp`. Agents don't need file system access - everything is done through MCP. + +**Analysis Tools:** + +| Tool | Description | +|------|-------------| +| `optimize_graph` | Find minimal circuit for a behavior (streams progress) | +| `get_component_info` | Get component interpretation, token stats, correlations | +| `run_ablation` | Test circuit by running with selected components only | +| `search_dataset` | Search SimpleStories training data for patterns | +| `create_prompt` | Tokenize text and get next-token probabilities | + +**Output Tools:** + +| Tool | Description | +|------|-------------| +| `update_research_log` | Append content to the agent's research log (PRIMARY OUTPUT) | +| `save_explanation` | Save a complete, validated behavior explanation | +| `set_investigation_summary` | Set title and summary shown in the investigations UI | +| `submit_suggestion` | Submit ideas for improving the tools or system | + +The `optimize_graph` tool streams progress events via SSE, giving real-time visibility +into long-running optimization operations. + +Suggestions from all agents are collected in `SPD_OUT_DIR/agent_swarm/suggestions.jsonl` (global file). + ## Output Structure ``` -SPD_OUT_DIR/agent_swarm// -├── metadata.json # Swarm configuration -├── task_1/ -│ ├── research_log.md # Human-readable progress log (PRIMARY OUTPUT) -│ ├── events.jsonl # Structured progress and observations -│ ├── explanations.jsonl # Complete behavior explanations -│ ├── app.db # Isolated SQLite database -│ ├── agent_prompt.md # The prompt given to the agent -│ └── claude_output.jsonl # Raw Claude Code output (stream-json format) -├── task_2/ -│ └── ... -└── task_N/ - └── ... 
+SPD_OUT_DIR/agent_swarm/ +├── suggestions.jsonl # System improvement suggestions from ALL agents (global) +└── / + ├── metadata.json # Swarm configuration + ├── task_1/ + │ ├── research_log.md # Human-readable progress log (PRIMARY OUTPUT) + │ ├── events.jsonl # Structured progress and observations + │ ├── explanations.jsonl # Complete behavior explanations + │ ├── summary.json # Agent-provided title and summary for UI + │ ├── app.db # Isolated SQLite database + │ ├── agent_prompt.md # The prompt given to the agent + │ ├── mcp_config.json # MCP server configuration for Claude Code + │ └── claude_output.jsonl # Raw Claude Code output (stream-json format) + ├── task_2/ + │ └── ... + └── task_N/ + └── ... ``` ## Key Files @@ -103,6 +137,9 @@ tail -f SPD_OUT_DIR/agent_swarm//task_*/events.jsonl # View all explanations cat SPD_OUT_DIR/agent_swarm//task_*/explanations.jsonl | jq . +# View agent suggestions for system improvement (global file) +cat SPD_OUT_DIR/agent_swarm/suggestions.jsonl | jq . + # Check SLURM job status squeue --me diff --git a/spd/agent_swarm/agent_prompt.py b/spd/agent_swarm/agent_prompt.py index b469b47b5..44424c190 100644 --- a/spd/agent_swarm/agent_prompt.py +++ b/spd/agent_swarm/agent_prompt.py @@ -1,8 +1,7 @@ """System prompt for SPD investigation agents. This module contains the detailed instructions given to each agent in the swarm. -The prompt explains how to use the SPD app API and the scientific methodology -for investigating model behaviors. +The agent has access to SPD tools via MCP - tools are self-documenting. """ AGENT_SYSTEM_PROMPT = """ @@ -10,7 +9,7 @@ You are a research agent investigating behaviors in a neural network model decomposition. Your goal is to find interesting behaviors, understand how components interact to produce -them, and document your findings as explanations. +them, and document your findings. ## Your Mission @@ -23,326 +22,126 @@ 2. 
**Understand the mechanism**: Figure out which components are involved and how they work together to produce the behavior -3. **Document your findings**: Write a clear explanation with supporting evidence +3. **Document your findings**: Write clear explanations with supporting evidence -## The SPD App Backend +## Available Tools (via MCP) -You have access to an SPD (Stochastic Parameter Decomposition) app backend running at: -`http://localhost:{port}` +You have access to SPD analysis tools. Use them directly - they have full documentation. -This app provides APIs for: -- Loading decomposed models -- Computing attribution graphs showing how components interact -- Optimizing sparse circuits for specific behaviors -- Running interventions (ablations) to test hypotheses -- Viewing component interpretations and correlations -- Searching the training dataset +**Analysis Tools:** +- **optimize_graph**: Find the minimal circuit for a behavior (e.g., "boy" → "he") +- **get_component_info**: Get interpretation and token stats for a component +- **run_ablation**: Test a circuit by running with only selected components +- **search_dataset**: Find examples in the training data +- **create_prompt**: Tokenize text for analysis -## API Reference - -### Health Check -```bash -curl http://localhost:{port}/api/health -# Returns: {{"status": "ok"}} -``` - -### Load a Run (ALREADY DONE FOR YOU) -The run is pre-loaded. 
Check status with: -```bash -curl http://localhost:{port}/api/status -``` - -### Create a Custom Prompt -To analyze a specific prompt: -```bash -curl -X POST "http://localhost:{port}/api/prompts/custom?text=The%20boy%20ate%20his" -# Returns: {{"id": , "token_ids": [...], "tokens": [...], "preview": "...", "next_token_probs": [...]}} -``` - -### Compute Optimized Attribution Graph (MOST IMPORTANT) -This optimizes a sparse circuit that achieves a behavior: -```bash -curl -X POST "http://localhost:{port}/api/graphs/optimized/stream?prompt_id=&loss_type=ce&loss_position=&label_token=&steps=100&imp_min_coeff=0.1&pnorm=0.5&mask_type=hard&loss_coeff=1.0&ci_threshold=0.01&normalize=target" -# Streams SSE events, final event has type="complete" with graph data -``` - -Parameters: -- `prompt_id`: ID from creating custom prompt -- `loss_type`: "ce" for cross-entropy (predicting specific token) or "kl" (matching full distribution) -- `loss_position`: Token position to optimize (0-indexed, usually last position) -- `label_token`: Token ID to predict (for CE loss) -- `steps`: Optimization steps (50-200 typical) -- `imp_min_coeff`: Importance minimization coefficient (0.05-0.3) -- `pnorm`: P-norm for sparsity (0.3-1.0, lower = sparser) -- `mask_type`: "hard" for binary masks, "soft" for continuous -- `ci_threshold`: Threshold for including nodes in graph (0.01-0.1) -- `normalize`: "target" normalizes by target layer, "none" for raw values - -### Get Component Interpretations -```bash -curl "http://localhost:{port}/api/correlations/interpretations" -# Returns: {{"h.0.mlp.c_fc:5": {{"label": "...", "confidence": "high"}}, ...}} -``` - -Get full interpretation details: -```bash -curl "http://localhost:{port}/api/correlations/interpretations/h.0.mlp.c_fc/5" -# Returns: {{"reasoning": "...", "prompt": "..."}} -``` - -### Get Component Token Statistics -```bash -curl "http://localhost:{port}/api/correlations/token_stats/h.0.mlp.c_fc/5?top_k=20" -# Returns input/output token 
associations -``` - -### Get Component Correlations -```bash -curl "http://localhost:{port}/api/correlations/components/h.0.mlp.c_fc/5?top_k=20" -# Returns components that frequently co-activate -``` - -### Run Intervention (Ablation) -Test a hypothesis by running the model with only selected components active: -```bash -curl -X POST "http://localhost:{port}/api/intervention/run" \\ - -H "Content-Type: application/json" \\ - -d '{{"graph_id": , "text": "The boy ate his", "selected_nodes": ["h.0.mlp.c_fc:3:5", "h.1.attn.o_proj:3:10"], "top_k": 10}}' -# Returns predictions with only selected components active vs full model -``` - -Node format: "layer:seq_pos:component_idx" -- `layer`: e.g., "h.0.mlp.c_fc", "h.1.attn.o_proj" -- `seq_pos`: Position in sequence (0-indexed) -- `component_idx`: Component index within layer - -### Search Dataset -Find prompts with specific patterns: -```bash -curl -X POST "http://localhost:{port}/api/dataset/search?query=she%20said&split=train" -curl "http://localhost:{port}/api/dataset/results?page=1&page_size=20" -``` - -### Get Random Samples with Loss -Find high/low loss examples: -```bash -curl "http://localhost:{port}/api/dataset/random_with_loss?n_samples=20&seed=42" -``` - -### Probe Component Activation -See how a component responds to arbitrary text: -```bash -curl -X POST "http://localhost:{port}/api/activation_contexts/probe" \\ - -H "Content-Type: application/json" \\ - -d '{{"text": "The boy ate his", "layer": "h.0.mlp.c_fc", "component_idx": 5}}' -# Returns CI values and activations at each position -``` - -### Get Dataset Attributions -See which components influence each other across the training data: -```bash -curl "http://localhost:{port}/api/dataset_attributions/h.0.mlp.c_fc/5?k=10" -# Returns positive/negative sources and targets -``` +**Output Tools:** +- **update_research_log**: Append to your research log (PRIMARY OUTPUT - use frequently!) 
+- **save_explanation**: Save a complete, validated behavior explanation +- **set_investigation_summary**: Set a title and summary for your investigation (shown in UI) +- **submit_suggestion**: Submit ideas for improving the tools or system ## Investigation Methodology ### Step 1: Find an Interesting Behavior -Start by exploring the model's behavior: - -1. **Search for patterns**: Use `/api/dataset/search` to find prompts with specific - linguistic patterns (pronouns, verb conjugations, completions, etc.) - -2. **Look at high-loss examples**: Use `/api/dataset/random_with_loss` to find where - the model struggles or succeeds - -3. **Create test prompts**: Use `/api/prompts/custom` to create prompts that test - specific capabilities - -Good behaviors to investigate: -- Gendered pronoun prediction ("The doctor said she" vs "The doctor said he") -- Subject-verb agreement ("The cats are" vs "The cat is") -- Pattern completion ("1, 2, 3," → "4") -- Semantic associations ("The capital of France is" → "Paris") -- Grammatical structure (completing sentences correctly) +Start by exploring: +- Search for linguistic patterns: pronouns, verb agreement, completions +- Create test prompts that show clear model behavior +- Good targets: gendered pronouns, subject-verb agreement, semantic associations ### Step 2: Optimize a Sparse Circuit Once you have a behavior: - -1. **Create the prompt** via `/api/prompts/custom` - -2. **Identify the target token**: What token should be predicted? Get its ID from - the tokenizer or from the prompt creation response. - -3. **Run optimization** via `/api/graphs/optimized/stream`: - - Use `loss_type=ce` with the target token - - Set `loss_position` to the position where prediction matters - - Start with `imp_min_coeff=0.1`, `pnorm=0.5`, `steps=100` - - Use `ci_threshold=0.01` to see active components - -4. 
**Examine the graph**: The response shows: - - `nodeCiVals`: Which components are active (high CI = important) - - `edges`: How components connect (gradient flow) - - `outputProbs`: Model predictions +1. Use `optimize_graph` with your prompt and target token +2. Examine which components have high CI values +3. Note the circuit size (fewer = cleaner mechanism) ### Step 3: Understand Component Roles -For each important component in the graph: - -1. **Check the interpretation**: Use `/api/correlations/interpretations//` - to see if we already have an idea what this component does - -2. **Look at token stats**: Use `/api/correlations/token_stats//` to see - what tokens activate this component (input) and what it predicts (output) - -3. **Check correlations**: Use `/api/correlations/components//` to see - what other components co-activate - -4. **Probe on variations**: Use `/api/activation_contexts/probe` to see how the - component responds to related prompts +For each important component: +1. Use `get_component_info` to see its interpretation and token stats +2. Look at what tokens activate it (input) and what it predicts (output) +3. Check correlated components ### Step 4: Test with Ablations Form hypotheses and test them: - -1. **Hypothesis**: "Component X stores information about gender" - -2. **Test**: Run intervention with and without component X - - If prediction changes as expected → supports hypothesis - - If no change → component may not be necessary for this - - If unexpected change → revise hypothesis - -3. **Control**: Try ablating other components to ensure specificity +1. Use `run_ablation` with the circuit's components +2. Verify predictions match expectations +3. 
Try removing individual components to find critical ones ### Step 5: Document Your Findings -Write a `BehaviorExplanation` with: -- Clear subject prompt -- Description of the behavior -- Components and their roles -- How they work together -- Supporting evidence from ablations/attributions -- Confidence level -- Alternative hypotheses you considered -- Limitations +Use `update_research_log` frequently - this is how humans monitor your work! +When you complete an investigation, use `save_explanation` to create a structured record. ## Scientific Principles -### Be Epistemologically Humble -- Your first hypothesis is probably wrong or incomplete -- Always consider alternative explanations -- A single confirming example doesn't prove a theory -- Look for disconfirming evidence - -### Be Bayesian -- Start with priors from component interpretations -- Update beliefs based on evidence -- Consider the probability of the evidence under different hypotheses -- Don't anchor too strongly on initial observations - -### Triangulate Evidence -- Don't rely on a single type of evidence -- Ablation results + attribution patterns + token stats together are stronger -- Look for convergent evidence from multiple sources - -### Document Uncertainty -- Be explicit about what you're confident in vs. uncertain about -- Note when evidence is weak or ambiguous -- Identify what additional tests would strengthen the explanation +- **Be skeptical**: Your first hypothesis is probably incomplete +- **Triangulate**: Don't rely on a single type of evidence +- **Document uncertainty**: Note what you're confident in vs. uncertain about +- **Consider alternatives**: What else could explain the behavior? ## Output Format -Write your findings to the output files. **The research log is your primary output for humans to read.** +### Research Log (PRIMARY OUTPUT - Update frequently!) -### research_log.md (MOST IMPORTANT - Write here frequently!) -This is a human-readable log of your investigation. 
Write here often so someone can follow your progress. -Use clear markdown formatting: +Use `update_research_log` with markdown content. Call it every few minutes to show progress: -```markdown -## [2026-01-30 14:23:15] Starting Investigation - -Looking at component interpretations to find interesting patterns... - -## [2026-01-30 14:25:42] Hypothesis: Gendered Pronoun Circuit - -Found components that seem related to pronouns: -- h.0.mlp.c_fc:42 - "he/his pronouns after male subjects" -- h.0.mlp.c_fc:89 - "she/her pronouns after female subjects" - -Testing with prompt: "The boy said that he" - -## [2026-01-30 14:28:03] Optimization Results - -Ran optimization for "he" prediction at position 4: -- Found 15 active components -- Key components: h.0.mlp.c_fc:42 (CI=0.92), h.1.attn.o_proj:156 (CI=0.78) +Example calls: +``` +update_research_log("# Research Log - Task 1\n\nStarting investigation...\n\n") -## [2026-01-30 14:31:17] Ablation Test +update_research_log("## [14:25:42] Hypothesis: Gendered Pronoun Circuit\n\nTesting prompt: 'The boy said that' → expecting ' he'\n\nUsed optimize_graph - found 15 active components:\n- h.0.mlp.c_fc:407 (CI=0.95) - 'male subjects'\n- h.3.attn.o_proj:262 (CI=0.92) - 'masculine pronouns'\n\n") -Ablating h.0.mlp.c_fc:42: -- Before: P(he)=0.82, P(she)=0.11 -- After: P(he)=0.23, P(she)=0.45 +update_research_log("## [14:28:03] Ablation Test\n\nResult: P(he) = 0.89 (vs 0.22 baseline)\n\nThis confirms the circuit is sufficient!\n\n") +``` -This confirms the component is important for masculine pronoun prediction! +### Saving Explanations -## [2026-01-30 14:35:44] Conclusion +When you have a complete explanation, use `save_explanation`: -Found a circuit for gendered pronoun prediction. Components h.0.mlp.c_fc:42 and -h.1.attn.o_proj:156 work together to predict masculine pronouns after male subjects. 
+
```
+save_explanation(
+    subject_prompt="The boy said that",
+    behavior_description="Predicts masculine pronoun 'he' after male subject",
+    components_involved=[
+        {"component_key": "h.0.mlp.c_fc:407", "role": "Male subject detector"},
+        {"component_key": "h.3.attn.o_proj:262", "role": "Masculine pronoun promoter"}
+    ],
+    explanation="Component h.0.mlp.c_fc:407 activates on male subjects...",
+    confidence="medium",
+    limitations=["Only tested on simple sentences"]
+)
```
 
-**TIP**: Get the current timestamp with `date '+%Y-%m-%d %H:%M:%S'` for your log entries.
+### Submitting Suggestions
 
-**IMPORTANT**: Update the research log every few minutes with your current progress,
-findings, and next steps. This is how humans monitor your work!
+If you have ideas for improving the system, use `submit_suggestion`:
 
-### events.jsonl
-Log structured progress and observations:
-```json
-{{"event_type": "observation", "message": "Component h.0.mlp.c_fc:5 has high CI when subject is male", "details": {{"ci_value": 0.85}}, "timestamp": "..."}}
 ```
-
-### explanations.jsonl
-When you have a complete explanation:
-```json
-{{
-  "subject_prompt": "The boy ate his lunch",
-  "behavior_description": "Correctly predicts gendered pronoun 'his' after male subject",
-  "components_involved": [
-    {{"component_key": "h.0.mlp.c_fc:5", "role": "Encodes subject gender as male", "interpretation": "male names/subjects"}},
-    {{"component_key": "h.1.attn.o_proj:10", "role": "Transmits gender information to output", "interpretation": null}}
-  ],
-  "explanation": "Component h.0.mlp.c_fc:5 activates on male subjects and stores gender information...",
-  "supporting_evidence": [
-    {{"evidence_type": "ablation", "description": "Removing component causes prediction to change from 'his' to 'her'", "details": {{"without_component": {{"his": 0.1, "her": 0.6}}, "with_component": {{"his": 0.8, "her": 0.1}}}}}}
-  ],
-  "confidence": "medium",
-  "alternative_hypotheses": ["Component might encode broader 
concept of masculine entities, not just humans"], - "limitations": ["Only tested on simple subject-pronoun sentences"] -}} +submit_suggestion( + category="tool_improvement", + title="Add batch ablation support", + description="It would be faster to test multiple ablations at once...", + context="I was testing 10 different component subsets one at a time" +) ``` ## Getting Started -1. **Create your research log**: Start by creating `research_log.md` with a header -2. Check the current status: `curl http://localhost:{port}/api/status` -3. Explore available interpretations: `curl http://localhost:{port}/api/correlations/interpretations` -4. Search for interesting prompts or create your own -5. **Update research_log.md** with what you're investigating -6. Optimize a sparse circuit for a behavior you find -7. Investigate the components involved -8. Test hypotheses with ablations -9. **Update research_log.md** with findings -10. Document complete explanations in `explanations.jsonl` - -**Remember to update research_log.md frequently** - this is how humans follow your progress! +1. **Create your research log** with `update_research_log("# Research Log - Task N\n\n...")` +2. Use analysis tools to explore the model +3. Find an interesting behavior to investigate +4. **Call `update_research_log` frequently** - humans are watching! +5. Use `save_explanation` for complete findings +6. **Call `set_investigation_summary`** with a title and summary when done (or periodically for updates) You are exploring! Not every investigation will lead to a clear explanation. Document what you learn, even if it's "this was more complicated than expected." -Good luck, and happy investigating! +Good luck! """ @@ -350,7 +149,7 @@ def get_agent_prompt(port: int, wandb_path: str, task_id: int, output_dir: str) """Generate the full agent prompt with runtime parameters filled in. Args: - port: The port the backend is running on. 
+ port: The port the backend is running on (for reference, tools use MCP). wandb_path: The WandB path of the loaded run. task_id: The SLURM task ID for this agent. output_dir: Path to the agent's output directory. @@ -358,21 +157,19 @@ def get_agent_prompt(port: int, wandb_path: str, task_id: int, output_dir: str) Returns: The complete agent prompt with parameters substituted. """ - prompt = AGENT_SYSTEM_PROMPT.format(port=port) - runtime_context = f""" ## Runtime Context -- **Backend URL**: http://localhost:{port} -- **Loaded Run**: {wandb_path} +- **Model Run**: {wandb_path} - **Task ID**: {task_id} -- **Output Directory**: {output_dir} -Your output files: -- `{output_dir}/research_log.md` - **PRIMARY OUTPUT** - Write readable progress updates here frequently! -- `{output_dir}/events.jsonl` - Log structured events and observations here -- `{output_dir}/explanations.jsonl` - Write complete explanations here +Use the MCP tools for ALL output: +- `update_research_log` → **PRIMARY OUTPUT** - Update frequently with your progress! +- `save_explanation` → Save complete, validated behavior explanations +- `submit_suggestion` → Share ideas for improving the system -**Start by creating research_log.md with a header, then update it every few minutes!** +**Start by calling update_research_log to create your log, then investigate!** """ - return prompt + runtime_context + # Note: output_dir and port are available but agents shouldn't need them + _ = output_dir, port + return AGENT_SYSTEM_PROMPT + runtime_context diff --git a/spd/agent_swarm/scripts/run_agent.py b/spd/agent_swarm/scripts/run_agent.py index 3c5a78449..627b6d473 100644 --- a/spd/agent_swarm/scripts/run_agent.py +++ b/spd/agent_swarm/scripts/run_agent.py @@ -4,10 +4,12 @@ 1. Creates an isolated output directory for this agent 2. Starts the app backend with an isolated database 3. Loads the SPD run -4. Launches Claude Code with investigation instructions -5. Handles cleanup on exit +4. 
Configures MCP server for Claude Code +5. Launches Claude Code with investigation instructions +6. Handles cleanup on exit """ +import json import os import signal import socket @@ -26,6 +28,21 @@ from spd.log import logger +def write_mcp_config(task_dir: Path, port: int) -> Path: + """Write MCP configuration file for Claude Code.""" + mcp_config = { + "mcpServers": { + "spd": { + "type": "http", + "url": f"http://localhost:{port}/mcp", + } + } + } + config_path = task_dir / "mcp_config.json" + config_path.write_text(json.dumps(mcp_config, indent=2)) + return config_path + + def find_available_port(start_port: int = 8000, max_attempts: int = 100) -> int: """Find an available port starting from start_port.""" for offset in range(max_attempts): @@ -124,9 +141,13 @@ def run_agent( ), ) - # Start backend with isolated database + # Start backend with isolated database and swarm configuration env = os.environ.copy() env["SPD_APP_DB_PATH"] = str(db_path) + env["SPD_MCP_EVENTS_PATH"] = str(events_path) + env["SPD_MCP_TASK_DIR"] = str(task_dir) + # Suggestions go to a global file (one level above swarm dirs) + env["SPD_MCP_SUGGESTIONS_PATH"] = str(swarm_dir.parent / "suggestions.jsonl") backend_cmd = [ sys.executable, @@ -214,7 +235,12 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: prompt_path = task_dir / "agent_prompt.md" prompt_path.write_text(agent_prompt) - # Launch Claude Code with streaming JSON output + # Write MCP config for Claude Code + mcp_config_path = write_mcp_config(task_dir, port) + logger.info(f"[Task {task_id}] MCP config written to {mcp_config_path}") + + # Launch Claude Code with streaming JSON output and MCP + # No --dangerously-skip-permissions needed - agents use MCP tools for all I/O claude_output_path = task_dir / "claude_output.jsonl" claude_cmd = [ "claude", @@ -224,7 +250,8 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: "stream-json", "--max-turns", str(max_turns), - 
"--dangerously-skip-permissions", + "--mcp-config", + str(mcp_config_path), ] logger.info(f"[Task {task_id}] Starting Claude Code (max_turns={max_turns})...") @@ -252,15 +279,16 @@ def cleanup(signum: int | None = None, frame: FrameType | None = None) -> None: Please begin your investigation: -1. **FIRST**: Create `{task_dir}/research_log.md` with a header like "# Research Log - Task {task_id}" -2. Check the backend status and explore component interpretations -3. Find an interesting behavior to investigate -4. **Update research_log.md frequently** with your progress, findings, and next steps +1. **FIRST**: Use the `update_research_log` tool to create your research log with a header like: + "# Research Log - Task {task_id}\\n\\nStarting investigation of {wandb_path}\\n\\n" +2. Explore component interpretations using `get_component_info` +3. Find an interesting behavior to investigate with `optimize_graph` +4. **Use `update_research_log` frequently** to document your progress, findings, and next steps Remember: -- research_log.md is your PRIMARY output - humans will read this to follow your work -- Update it every few minutes with what you're doing and discovering -- Write complete explanations to explanations.jsonl when you finish investigating a behavior +- The research log is your PRIMARY output - use `update_research_log` every few minutes +- Use `save_explanation` to record complete, validated explanations +- Use `submit_suggestion` if you have ideas for improving the tools or system """ assert claude_proc.stdin is not None claude_proc.stdin.write(investigation_request) diff --git a/spd/app/CLAUDE.md b/spd/app/CLAUDE.md index 95d6bc1b3..c1eed4f49 100644 --- a/spd/app/CLAUDE.md +++ b/spd/app/CLAUDE.md @@ -48,7 +48,8 @@ backend/ ├── correlations.py # Component correlations + token stats + interpretations ├── clusters.py # Component clustering ├── dataset_search.py # SimpleStories dataset search - └── agents.py # Various useful endpoints that AI agents should look 
at when helping + ├── agents.py # Various useful endpoints that AI agents should look at when helping + └── mcp.py # MCP (Model Context Protocol) endpoint for Claude Code ``` Note: Activation contexts, correlations, and token stats are now loaded from pre-harvested data (see `spd/harvest/`). The app no longer computes these on-the-fly. diff --git a/spd/app/backend/routers/__init__.py b/spd/app/backend/routers/__init__.py index 79cea1087..83f6e8ac2 100644 --- a/spd/app/backend/routers/__init__.py +++ b/spd/app/backend/routers/__init__.py @@ -9,6 +9,8 @@ from spd.app.backend.routers.dataset_search import router as dataset_search_router from spd.app.backend.routers.graphs import router as graphs_router from spd.app.backend.routers.intervention import router as intervention_router +from spd.app.backend.routers.investigations import router as investigations_router +from spd.app.backend.routers.mcp import router as mcp_router from spd.app.backend.routers.prompts import router as prompts_router from spd.app.backend.routers.runs import router as runs_router @@ -22,6 +24,8 @@ "dataset_search_router", "graphs_router", "intervention_router", + "investigations_router", + "mcp_router", "prompts_router", "runs_router", ] diff --git a/spd/app/backend/routers/investigations.py b/spd/app/backend/routers/investigations.py new file mode 100644 index 000000000..3ea6244c3 --- /dev/null +++ b/spd/app/backend/routers/investigations.py @@ -0,0 +1,262 @@ +"""Investigations endpoint for viewing agent swarm results. + +Lists and serves investigation data from SPD_OUT_DIR/agent_swarm/. +Each task is treated as an independent investigation (flattened across swarms). 
+""" + +import json +from datetime import datetime +from pathlib import Path +from typing import Any + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +from spd.settings import SPD_OUT_DIR + +router = APIRouter(prefix="/api/investigations", tags=["investigations"]) + +SWARM_DIR = SPD_OUT_DIR / "agent_swarm" + + +class InvestigationSummary(BaseModel): + """Summary of a single investigation (task).""" + + id: str # swarm_id/task_id + swarm_id: str + task_id: int + wandb_path: str | None + created_at: str + has_research_log: bool + has_explanations: bool + event_count: int + last_event_time: str | None + last_event_message: str | None + # Agent-provided summary + title: str | None + summary: str | None + status: str | None # in_progress, completed, inconclusive + + +class EventEntry(BaseModel): + """A single event from events.jsonl.""" + + event_type: str + timestamp: str + message: str + details: dict[str, Any] | None = None + + +class InvestigationDetail(BaseModel): + """Full detail of an investigation including logs.""" + + id: str + swarm_id: str + task_id: int + wandb_path: str | None + created_at: str + research_log: str | None + events: list[EventEntry] + explanations: list[dict[str, Any]] + # Agent-provided summary + title: str | None + summary: str | None + status: str | None + + +def _parse_swarm_metadata(swarm_path: Path) -> dict[str, Any] | None: + """Parse metadata.json from a swarm directory.""" + metadata_path = swarm_path / "metadata.json" + if not metadata_path.exists(): + return None + try: + data: dict[str, Any] = json.loads(metadata_path.read_text()) + return data + except Exception: + return None + + +def _get_last_event(events_path: Path) -> tuple[str | None, str | None, int]: + """Get the last event timestamp, message, and total count from events.jsonl.""" + if not events_path.exists(): + return None, None, 0 + + last_time = None + last_msg = None + count = 0 + + try: + with open(events_path) as f: + for line in 
f: + line = line.strip() + if not line: + continue + count += 1 + try: + event = json.loads(line) + last_time = event.get("timestamp") + last_msg = event.get("message") + except json.JSONDecodeError: + continue + except Exception: + pass + + return last_time, last_msg, count + + +def _parse_task_summary(task_path: Path) -> tuple[str | None, str | None, str | None]: + """Parse summary.json from a task directory. Returns (title, summary, status).""" + summary_path = task_path / "summary.json" + if not summary_path.exists(): + return None, None, None + try: + data: dict[str, Any] = json.loads(summary_path.read_text()) + return data.get("title"), data.get("summary"), data.get("status") + except Exception: + return None, None, None + + +def _get_task_created_at(task_path: Path, swarm_metadata: dict[str, Any] | None) -> str: + """Get creation time for a task.""" + # Try to get from first event + events_path = task_path / "events.jsonl" + if events_path.exists(): + try: + with open(events_path) as f: + first_line = f.readline().strip() + if first_line: + event = json.loads(first_line) + if "timestamp" in event: + return event["timestamp"] + except Exception: + pass + + # Fall back to swarm metadata + if swarm_metadata and "created_at" in swarm_metadata: + return swarm_metadata["created_at"] + + # Fall back to directory mtime + return datetime.fromtimestamp(task_path.stat().st_mtime).isoformat() + + +@router.get("") +def list_investigations() -> list[InvestigationSummary]: + """List all investigations (tasks) flattened across swarms.""" + if not SWARM_DIR.exists(): + return [] + + results = [] + + for swarm_path in SWARM_DIR.iterdir(): + if not swarm_path.is_dir() or not swarm_path.name.startswith("swarm-"): + continue + + swarm_id = swarm_path.name + metadata = _parse_swarm_metadata(swarm_path) + wandb_path = metadata.get("wandb_path") if metadata else None + + for task_path in swarm_path.iterdir(): + if not task_path.is_dir() or not task_path.name.startswith("task_"): + 
continue + + try: + task_id = int(task_path.name.split("_")[1]) + except (ValueError, IndexError): + continue + + events_path = task_path / "events.jsonl" + last_time, last_msg, event_count = _get_last_event(events_path) + title, summary, status = _parse_task_summary(task_path) + + results.append( + InvestigationSummary( + id=f"{swarm_id}/{task_id}", + swarm_id=swarm_id, + task_id=task_id, + wandb_path=wandb_path, + created_at=_get_task_created_at(task_path, metadata), + has_research_log=(task_path / "research_log.md").exists(), + has_explanations=(task_path / "explanations.jsonl").exists() + and (task_path / "explanations.jsonl").stat().st_size > 0, + event_count=event_count, + last_event_time=last_time, + last_event_message=last_msg, + title=title, + summary=summary, + status=status, + ) + ) + + # Sort by creation time, newest first + results.sort(key=lambda x: x.created_at, reverse=True) + return results + + +@router.get("/{swarm_id}/{task_id}") +def get_investigation(swarm_id: str, task_id: int) -> InvestigationDetail: + """Get full details of an investigation.""" + swarm_path = SWARM_DIR / swarm_id + task_path = swarm_path / f"task_{task_id}" + + if not task_path.exists() or not task_path.is_dir(): + raise HTTPException(status_code=404, detail=f"Investigation {swarm_id}/{task_id} not found") + + metadata = _parse_swarm_metadata(swarm_path) + + # Read research log + research_log = None + research_log_path = task_path / "research_log.md" + if research_log_path.exists(): + research_log = research_log_path.read_text() + + # Read events + events = [] + events_path = task_path / "events.jsonl" + if events_path.exists(): + with open(events_path) as f: + for line in f: + line = line.strip() + if not line: + continue + try: + event = json.loads(line) + events.append( + EventEntry( + event_type=event.get("event_type", "unknown"), + timestamp=event.get("timestamp", ""), + message=event.get("message", ""), + details=event.get("details"), + ) + ) + except 
json.JSONDecodeError: + continue + + # Read explanations + explanations: list[dict[str, Any]] = [] + explanations_path = task_path / "explanations.jsonl" + if explanations_path.exists(): + with open(explanations_path) as f: + for line in f: + line = line.strip() + if not line: + continue + try: + explanations.append(json.loads(line)) + except json.JSONDecodeError: + continue + + title, summary, status = _parse_task_summary(task_path) + + return InvestigationDetail( + id=f"{swarm_id}/{task_id}", + swarm_id=swarm_id, + task_id=task_id, + wandb_path=metadata.get("wandb_path") if metadata else None, + created_at=_get_task_created_at(task_path, metadata), + research_log=research_log, + events=events, + explanations=explanations, + title=title, + summary=summary, + status=status, + ) diff --git a/spd/app/backend/routers/mcp.py b/spd/app/backend/routers/mcp.py new file mode 100644 index 000000000..c986109a0 --- /dev/null +++ b/spd/app/backend/routers/mcp.py @@ -0,0 +1,1171 @@ +"""MCP (Model Context Protocol) endpoint for Claude Code integration. + +This router implements the MCP JSON-RPC protocol over HTTP, allowing Claude Code +to use SPD tools directly with proper schemas and streaming progress. 
+ +MCP Spec: https://modelcontextprotocol.io/specification/2025-06-18/basic/transports +""" + +import inspect +import json +import queue +import threading +import traceback +from collections.abc import Generator +from datetime import UTC, datetime +from pathlib import Path +from typing import Any, Literal + +import torch +from fastapi import APIRouter, Request +from fastapi.responses import JSONResponse, StreamingResponse +from pydantic import BaseModel + +from spd.app.backend.compute import ( + compute_intervention_forward, + compute_prompt_attributions_optimized, +) +from spd.app.backend.database import StoredGraph +from spd.app.backend.optim_cis import CELossConfig, OptimCIConfig +from spd.app.backend.routers.graphs import build_out_probs +from spd.app.backend.state import StateManager +from spd.configs import ImportanceMinimalityLossConfig +from spd.harvest import analysis +from spd.log import logger +from spd.utils.distributed_utils import get_device + +router = APIRouter(tags=["mcp"]) + +DEVICE = get_device() + +# MCP protocol version +MCP_PROTOCOL_VERSION = "2024-11-05" + +# Optional paths for swarm integration (set via environment at runtime) +_events_log_path: Path | None = None +_task_dir: Path | None = None +_suggestions_path: Path | None = None + + +def set_events_log_path(path: Path | None) -> None: + """Set the path for logging MCP tool events (for swarm monitoring).""" + global _events_log_path + _events_log_path = path + + +def set_task_dir(path: Path | None) -> None: + """Set the task directory for research log and explanations output.""" + global _task_dir + _task_dir = path + + +def set_suggestions_path(path: Path | None) -> None: + """Set the path for the central suggestions file.""" + global _suggestions_path + _suggestions_path = path + + +def _log_event(event_type: str, message: str, details: dict[str, Any] | None = None) -> None: + """Log an event to the events file if configured.""" + if _events_log_path is None: + return + event = { + 
"event_type": event_type, + "timestamp": datetime.now(UTC).isoformat(), + "message": message, + "details": details or {}, + } + with open(_events_log_path, "a") as f: + f.write(json.dumps(event) + "\n") + + +# ============================================================================= +# MCP Protocol Types +# ============================================================================= + + +class MCPRequest(BaseModel): + """JSON-RPC 2.0 request.""" + + jsonrpc: Literal["2.0"] + id: int | str | None = None + method: str + params: dict[str, Any] | None = None + + +class MCPResponse(BaseModel): + """JSON-RPC 2.0 response.""" + + jsonrpc: Literal["2.0"] = "2.0" + id: int | str | None + result: Any | None = None + error: dict[str, Any] | None = None + + +class ToolDefinition(BaseModel): + """MCP tool definition.""" + + name: str + description: str + inputSchema: dict[str, Any] + + +# ============================================================================= +# Tool Definitions +# ============================================================================= + +TOOLS: list[ToolDefinition] = [ + ToolDefinition( + name="optimize_graph", + description="""Optimize a sparse circuit for a specific behavior. + +Given a prompt and target token, finds the minimal set of components that produce the target prediction. +Returns the optimized graph with component CI values and edges showing information flow. + +This is the primary tool for understanding how the model produces a specific output.""", + inputSchema={ + "type": "object", + "properties": { + "prompt_text": { + "type": "string", + "description": "The input text to analyze (e.g., 'The boy said that')", + }, + "target_token": { + "type": "string", + "description": "The token to predict (e.g., ' he'). Include leading space if needed.", + }, + "loss_position": { + "type": "integer", + "description": "Position to optimize prediction at (0-indexed, usually last position). 
If not specified, uses the last position.", + }, + "steps": { + "type": "integer", + "description": "Optimization steps (default: 100, more = sparser but slower)", + "default": 100, + }, + "ci_threshold": { + "type": "number", + "description": "CI threshold for including components (default: 0.5, lower = more components)", + "default": 0.5, + }, + }, + "required": ["prompt_text", "target_token"], + }, + ), + ToolDefinition( + name="get_component_info", + description="""Get detailed information about a component. + +Returns the component's interpretation (what it does), token statistics (what tokens +activate it and what it predicts), and correlated components. + +Use this to understand what role a component plays in a circuit.""", + inputSchema={ + "type": "object", + "properties": { + "layer": { + "type": "string", + "description": "Layer name (e.g., 'h.0.mlp.c_fc', 'h.2.attn.o_proj')", + }, + "component_idx": { + "type": "integer", + "description": "Component index within the layer", + }, + "top_k": { + "type": "integer", + "description": "Number of top tokens/correlations to return (default: 20)", + "default": 20, + }, + }, + "required": ["layer", "component_idx"], + }, + ), + ToolDefinition( + name="run_ablation", + description="""Run an ablation experiment with only selected components active. + +Tests a hypothesis by running the model with a sparse set of components. +Returns predictions showing what the circuit produces vs the full model. 
+ +Use this to verify that identified components are necessary and sufficient.""", + inputSchema={ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "Input text for the ablation", + }, + "selected_nodes": { + "type": "array", + "items": {"type": "string"}, + "description": "Node keys to keep active (format: 'layer:seq_pos:component_idx')", + }, + "top_k": { + "type": "integer", + "description": "Number of top predictions to return per position (default: 10)", + "default": 10, + }, + }, + "required": ["text", "selected_nodes"], + }, + ), + ToolDefinition( + name="search_dataset", + description="""Search the SimpleStories training dataset for patterns. + +Finds stories containing the query string. Use this to find examples of +specific linguistic patterns (pronouns, verb forms, etc.) for investigation.""", + inputSchema={ + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Text to search for (case-insensitive)", + }, + "limit": { + "type": "integer", + "description": "Maximum results to return (default: 20)", + "default": 20, + }, + }, + "required": ["query"], + }, + ), + ToolDefinition( + name="create_prompt", + description="""Create a prompt for analysis. + +Tokenizes the text and returns token IDs and next-token probabilities. +The returned prompt_id can be used with other tools.""", + inputSchema={ + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The text to create a prompt from", + }, + }, + "required": ["text"], + }, + ), + ToolDefinition( + name="update_research_log", + description="""Append content to your research log. + +Use this to document your investigation progress, findings, and next steps. +The research log is your primary output for humans to follow your work. 
+ +Call this frequently (every few minutes) with updates on what you're doing.""", + inputSchema={ + "type": "object", + "properties": { + "content": { + "type": "string", + "description": "Markdown content to append to the research log", + }, + }, + "required": ["content"], + }, + ), + ToolDefinition( + name="save_explanation", + description="""Save a complete behavior explanation. + +Use this when you have finished investigating a behavior and want to document +your findings. This creates a structured record of the behavior, the components +involved, and your explanation of how they work together. + +Only call this for complete, validated explanations - not preliminary hypotheses.""", + inputSchema={ + "type": "object", + "properties": { + "subject_prompt": { + "type": "string", + "description": "A prompt that demonstrates the behavior", + }, + "behavior_description": { + "type": "string", + "description": "Clear description of the behavior", + }, + "components_involved": { + "type": "array", + "items": { + "type": "object", + "properties": { + "component_key": { + "type": "string", + "description": "Component key (e.g., 'h.0.mlp.c_fc:5')", + }, + "role": { + "type": "string", + "description": "The role this component plays", + }, + "interpretation": { + "type": "string", + "description": "Auto-interp label if available", + }, + }, + "required": ["component_key", "role"], + }, + "description": "List of components and their roles", + }, + "explanation": { + "type": "string", + "description": "How the components work together", + }, + "supporting_evidence": { + "type": "array", + "items": { + "type": "object", + "properties": { + "evidence_type": { + "type": "string", + "enum": [ + "ablation", + "attribution", + "activation_pattern", + "correlation", + "other", + ], + }, + "description": {"type": "string"}, + "details": {"type": "object"}, + }, + "required": ["evidence_type", "description"], + }, + "description": "Evidence supporting this explanation", + }, + 
"confidence": { + "type": "string", + "enum": ["high", "medium", "low"], + "description": "Your confidence level", + }, + "alternative_hypotheses": { + "type": "array", + "items": {"type": "string"}, + "description": "Other hypotheses you considered", + }, + "limitations": { + "type": "array", + "items": {"type": "string"}, + "description": "Known limitations of this explanation", + }, + }, + "required": [ + "subject_prompt", + "behavior_description", + "components_involved", + "explanation", + "confidence", + ], + }, + ), + ToolDefinition( + name="submit_suggestion", + description="""Submit a suggestion for improving the SPD system. + +Use this when you encounter limitations, have ideas for new tools, or think +of ways the system could better support investigation work. + +Suggestions are collected centrally and reviewed by humans to improve the system.""", + inputSchema={ + "type": "object", + "properties": { + "category": { + "type": "string", + "enum": ["tool_improvement", "new_tool", "documentation", "bug", "other"], + "description": "Category of suggestion", + }, + "title": { + "type": "string", + "description": "Brief title for the suggestion", + }, + "description": { + "type": "string", + "description": "Detailed description of the suggestion", + }, + "context": { + "type": "string", + "description": "What you were trying to do when you had this idea", + }, + }, + "required": ["category", "title", "description"], + }, + ), + ToolDefinition( + name="set_investigation_summary", + description="""Set a title and summary for your investigation. + +Call this when you've completed your investigation (or periodically as you make progress) +to provide a human-readable title and summary that will be shown in the investigations UI. + +The title should be short and descriptive. 
The summary should be 1-3 sentences +explaining what you investigated and what you found.""", + inputSchema={ + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Short title for the investigation (e.g., 'Gendered Pronoun Circuit')", + }, + "summary": { + "type": "string", + "description": "Brief summary of findings (1-3 sentences)", + }, + "status": { + "type": "string", + "enum": ["in_progress", "completed", "inconclusive"], + "description": "Current status of the investigation", + "default": "in_progress", + }, + }, + "required": ["title", "summary"], + }, + ), +] + + +# ============================================================================= +# Tool Implementations +# ============================================================================= + + +def _get_state(): + """Get state manager and loaded run, raising clear errors if not available.""" + manager = StateManager.get() + if manager.run_state is None: + raise ValueError("No run loaded. The backend must load a run first.") + return manager, manager.run_state + + +def _tool_optimize_graph(params: dict[str, Any]) -> Generator[dict[str, Any]]: + """Optimize a sparse circuit for a behavior. Yields progress events.""" + manager, loaded = _get_state() + + prompt_text = params["prompt_text"] + target_token = params["target_token"] + steps = params.get("steps", 100) + ci_threshold = params.get("ci_threshold", 0.5) + + # Tokenize prompt + token_ids = loaded.tokenizer.encode(prompt_text, add_special_tokens=False) + if not token_ids: + raise ValueError("Prompt text produced no tokens") + + # Find target token ID + target_token_ids = loaded.tokenizer.encode(target_token, add_special_tokens=False) + if len(target_token_ids) != 1: + raise ValueError( + f"Target token '{target_token}' tokenizes to {len(target_token_ids)} tokens, expected 1. 
" + f"Token IDs: {target_token_ids}" + ) + label_token = target_token_ids[0] + + # Determine loss position + loss_position = params.get("loss_position") + if loss_position is None: + loss_position = len(token_ids) - 1 + + if loss_position >= len(token_ids): + raise ValueError( + f"loss_position {loss_position} out of bounds for prompt with {len(token_ids)} tokens" + ) + + _log_event( + "tool_start", + f"optimize_graph: '{prompt_text}' → '{target_token}'", + {"steps": steps, "loss_position": loss_position}, + ) + + yield {"type": "progress", "current": 0, "total": steps, "stage": "starting optimization"} + + # Create prompt in DB + prompt_id = manager.db.add_custom_prompt( + run_id=loaded.run.id, + token_ids=token_ids, + context_length=loaded.context_length, + ) + + # Build optimization config + loss_config = CELossConfig(coeff=1.0, position=loss_position, label_token=label_token) + + optim_config = OptimCIConfig( + seed=0, + lr=1e-2, + steps=steps, + weight_decay=0.0, + lr_schedule="cosine", + lr_exponential_halflife=None, + lr_warmup_pct=0.01, + log_freq=max(1, steps // 10), + imp_min_config=ImportanceMinimalityLossConfig(coeff=0.1, pnorm=0.5, beta=0.0), + loss_config=loss_config, + sampling=loaded.config.sampling, + ce_kl_rounding_threshold=0.5, + mask_type="ci", + ) + + tokens_tensor = torch.tensor([token_ids], device=DEVICE) + progress_queue: queue.Queue[dict[str, Any]] = queue.Queue() + + def on_progress(current: int, total: int, stage: str) -> None: + progress_queue.put({"current": current, "total": total, "stage": stage}) + + # Run optimization in thread + result_holder: list[Any] = [] + error_holder: list[Exception] = [] + + def compute(): + try: + with manager.gpu_lock(): + result = compute_prompt_attributions_optimized( + model=loaded.model, + tokens=tokens_tensor, + sources_by_target=loaded.sources_by_target, + optim_config=optim_config, + output_prob_threshold=0.01, + device=DEVICE, + on_progress=on_progress, + ) + result_holder.append(result) + except 
Exception as e: + error_holder.append(e) + + thread = threading.Thread(target=compute) + thread.start() + + # Yield progress events (throttle logging to every 10% or 10 steps) + last_logged_step = -1 + log_interval = max(1, steps // 10) + + while thread.is_alive() or not progress_queue.empty(): + try: + progress = progress_queue.get(timeout=0.1) + current = progress["current"] + # Log to events.jsonl at intervals (for human monitoring) + if current - last_logged_step >= log_interval or current == progress["total"]: + _log_event( + "optimization_progress", + f"optimize_graph: step {current}/{progress['total']} ({progress['stage']})", + {"prompt": prompt_text, "target": target_token, **progress}, + ) + last_logged_step = current + # Always yield to SSE stream (for Claude) + yield {"type": "progress", **progress} + except queue.Empty: + continue + + thread.join() + + if error_holder: + raise error_holder[0] + + if not result_holder: + raise RuntimeError("Optimization completed but no result was produced") + + result = result_holder[0] + + # Build output + out_probs = build_out_probs( + ci_masked_out_probs=result.ci_masked_out_probs.cpu(), + ci_masked_out_logits=result.ci_masked_out_logits.cpu(), + target_out_probs=result.target_out_probs.cpu(), + target_out_logits=result.target_out_logits.cpu(), + output_prob_threshold=0.01, + token_strings=loaded.token_strings, + ) + + # Save graph to DB + from spd.app.backend.database import OptimizationParams + + opt_params = OptimizationParams( + imp_min_coeff=0.1, + steps=steps, + pnorm=0.5, + beta=0.0, + mask_type="ci", + loss=loss_config, + ) + graph_id = manager.db.save_graph( + prompt_id=prompt_id, + graph=StoredGraph( + graph_type="optimized", + edges=result.edges, + out_probs=out_probs, + node_ci_vals=result.node_ci_vals, + node_subcomp_acts=result.node_subcomp_acts, + optimization_params=opt_params, + ), + ) + + # Filter nodes by CI threshold + active_components = {k: v for k, v in result.node_ci_vals.items() if v >= 
ci_threshold} + + # Get target token probability + target_key = f"{loss_position}:{label_token}" + target_prob = out_probs.get(target_key) + + token_strings = [loaded.token_strings[t] for t in token_ids] + + final_result = { + "graph_id": graph_id, + "prompt_id": prompt_id, + "tokens": token_strings, + "target_token": target_token, + "target_token_id": label_token, + "target_position": loss_position, + "target_probability": target_prob.prob if target_prob else None, + "target_probability_baseline": target_prob.target_prob if target_prob else None, + "active_components": active_components, + "total_active": len(active_components), + "output_probs": {k: {"prob": v.prob, "token": v.token} for k, v in out_probs.items()}, + } + + _log_event( + "tool_complete", + f"optimize_graph complete: {len(active_components)} active components", + {"graph_id": graph_id, "target_prob": target_prob.prob if target_prob else None}, + ) + + yield {"type": "result", "data": final_result} + + +def _tool_get_component_info(params: dict[str, Any]) -> dict[str, Any]: + """Get detailed information about a component.""" + _, loaded = _get_state() + + layer = params["layer"] + component_idx = params["component_idx"] + top_k = params.get("top_k", 20) + component_key = f"{layer}:{component_idx}" + + _log_event( + "tool_call", f"get_component_info: {component_key}", {"layer": layer, "idx": component_idx} + ) + + result: dict[str, Any] = {"component_key": component_key} + + # Get interpretation + interpretations = loaded.harvest.interpretations + if component_key in interpretations: + interp = interpretations[component_key] + result["interpretation"] = { + "label": interp.label, + "confidence": interp.confidence, + "reasoning": interp.reasoning, + } + else: + result["interpretation"] = None + + # Get token stats + token_stats = loaded.harvest.token_stats + input_stats = analysis.get_input_token_stats( + token_stats, component_key, loaded.tokenizer, top_k + ) + output_stats = 
analysis.get_output_token_stats( + token_stats, component_key, loaded.tokenizer, top_k + ) + + if input_stats and output_stats: + result["token_stats"] = { + "input": { + "top_recall": input_stats.top_recall, + "top_precision": input_stats.top_precision, + "top_pmi": input_stats.top_pmi, + }, + "output": { + "top_recall": output_stats.top_recall, + "top_precision": output_stats.top_precision, + "top_pmi": output_stats.top_pmi, + "bottom_pmi": output_stats.bottom_pmi, + }, + } + else: + result["token_stats"] = None + + # Get correlations + correlations = loaded.harvest.correlations + if analysis.has_component(correlations, component_key): + result["correlated_components"] = { + "precision": [ + {"key": c.component_key, "score": c.score} + for c in analysis.get_correlated_components( + correlations, component_key, "precision", top_k + ) + ], + "pmi": [ + {"key": c.component_key, "score": c.score} + for c in analysis.get_correlated_components( + correlations, component_key, "pmi", top_k + ) + ], + } + else: + result["correlated_components"] = None + + return result + + +def _tool_run_ablation(params: dict[str, Any]) -> dict[str, Any]: + """Run ablation with selected components.""" + manager, loaded = _get_state() + + text = params["text"] + selected_nodes = params["selected_nodes"] + top_k = params.get("top_k", 10) + + _log_event( + "tool_call", + f"run_ablation: '{text[:50]}...' 
with {len(selected_nodes)} nodes", + {"text": text, "n_nodes": len(selected_nodes)}, + ) + + token_ids = loaded.tokenizer.encode(text, add_special_tokens=False) + tokens = torch.tensor([token_ids], dtype=torch.long, device=DEVICE) + + # Parse node keys + active_nodes = [] + for key in selected_nodes: + parts = key.split(":") + if len(parts) != 3: + raise ValueError(f"Invalid node key format: {key!r} (expected 'layer:seq:cIdx')") + layer, seq_str, cidx_str = parts + if layer in ("wte", "output"): + raise ValueError(f"Cannot intervene on {layer!r} nodes - only internal layers allowed") + active_nodes.append((layer, int(seq_str), int(cidx_str))) + + with manager.gpu_lock(): + result = compute_intervention_forward( + model=loaded.model, + tokens=tokens, + active_nodes=active_nodes, + top_k=top_k, + tokenizer=loaded.tokenizer, + ) + + predictions = [] + for pos_predictions in result.predictions_per_position: + pos_result = [] + for token, token_id, spd_prob, _logit, target_prob, _target_logit in pos_predictions: + pos_result.append( + { + "token": token, + "token_id": token_id, + "circuit_prob": round(spd_prob, 6), + "full_model_prob": round(target_prob, 6), + } + ) + predictions.append(pos_result) + + return { + "input_tokens": result.input_tokens, + "predictions_per_position": predictions, + "selected_nodes": selected_nodes, + } + + +def _tool_search_dataset(params: dict[str, Any]) -> dict[str, Any]: + """Search the SimpleStories dataset.""" + import time + + from datasets import Dataset, load_dataset + + query = params["query"] + limit = params.get("limit", 20) + search_query = query.lower() + + _log_event("tool_call", f"search_dataset: '{query}'", {"query": query, "limit": limit}) + + start_time = time.time() + dataset = load_dataset("lennart-finke/SimpleStories", split="train") + assert isinstance(dataset, Dataset) + + filtered = dataset.filter( + lambda x: search_query in x["story"].lower(), + num_proc=4, + ) + + results = [] + for i, item in enumerate(filtered): 
+ if i >= limit: + break + item_dict: dict[str, Any] = dict(item) + story: str = item_dict["story"] + results.append( + { + "story": story[:500] + "..." if len(story) > 500 else story, + "occurrence_count": story.lower().count(search_query), + } + ) + + return { + "query": query, + "total_matches": len(filtered), + "returned": len(results), + "search_time_seconds": round(time.time() - start_time, 2), + "results": results, + } + + +def _tool_create_prompt(params: dict[str, Any]) -> dict[str, Any]: + """Create a prompt from text.""" + manager, loaded = _get_state() + + text = params["text"] + + _log_event("tool_call", f"create_prompt: '{text[:50]}...'", {"text": text}) + + token_ids = loaded.tokenizer.encode(text, add_special_tokens=False) + if not token_ids: + raise ValueError("Text produced no tokens") + + prompt_id = manager.db.add_custom_prompt( + run_id=loaded.run.id, + token_ids=token_ids, + context_length=loaded.context_length, + ) + + # Compute next token probs + tokens_tensor = torch.tensor([token_ids], device=DEVICE) + with torch.no_grad(): + logits = loaded.model(tokens_tensor) + probs = torch.softmax(logits, dim=-1) + + next_token_probs = [] + for i in range(len(token_ids) - 1): + next_token_id = token_ids[i + 1] + prob = probs[0, i, next_token_id].item() + next_token_probs.append(round(prob, 6)) + next_token_probs.append(None) + + token_strings = [loaded.token_strings[t] for t in token_ids] + + return { + "prompt_id": prompt_id, + "text": text, + "tokens": token_strings, + "token_ids": token_ids, + "next_token_probs": next_token_probs, + } + + +def _tool_update_research_log(params: dict[str, Any]) -> dict[str, Any]: + """Append content to the research log.""" + if _task_dir is None: + raise ValueError("Research log not available - not running in swarm mode") + + content = params["content"] + research_log_path = _task_dir / "research_log.md" + + _log_event( + "tool_call", f"update_research_log: {len(content)} chars", {"preview": content[:100]} + ) + + # 
Append content with a newline separator + with open(research_log_path, "a") as f: + f.write(content) + if not content.endswith("\n"): + f.write("\n") + + return {"status": "ok", "path": str(research_log_path)} + + +def _tool_save_explanation(params: dict[str, Any]) -> dict[str, Any]: + """Save a behavior explanation to explanations.jsonl.""" + from spd.agent_swarm.schemas import BehaviorExplanation, ComponentInfo, Evidence + + if _task_dir is None: + raise ValueError("Explanations file not available - not running in swarm mode") + + _log_event( + "tool_call", + f"save_explanation: '{params['behavior_description'][:50]}...'", + {"prompt": params["subject_prompt"]}, + ) + + # Build components + components = [ + ComponentInfo( + component_key=c["component_key"], + role=c["role"], + interpretation=c.get("interpretation"), + ) + for c in params["components_involved"] + ] + + # Build evidence + evidence = [ + Evidence( + evidence_type=e["evidence_type"], + description=e["description"], + details=e.get("details", {}), + ) + for e in params.get("supporting_evidence", []) + ] + + explanation = BehaviorExplanation( + subject_prompt=params["subject_prompt"], + behavior_description=params["behavior_description"], + components_involved=components, + explanation=params["explanation"], + supporting_evidence=evidence, + confidence=params["confidence"], + alternative_hypotheses=params.get("alternative_hypotheses", []), + limitations=params.get("limitations", []), + ) + + explanations_path = _task_dir / "explanations.jsonl" + with open(explanations_path, "a") as f: + f.write(explanation.model_dump_json() + "\n") + + _log_event( + "explanation", + f"Saved explanation: {params['behavior_description']}", + {"confidence": params["confidence"], "n_components": len(components)}, + ) + + return {"status": "ok", "path": str(explanations_path)} + + +def _tool_submit_suggestion(params: dict[str, Any]) -> dict[str, Any]: + """Submit a suggestion for system improvement.""" + if 
_suggestions_path is None: + raise ValueError("Suggestions not available - not running in swarm mode") + + suggestion = { + "timestamp": datetime.now(UTC).isoformat(), + "category": params["category"], + "title": params["title"], + "description": params["description"], + "context": params.get("context"), + } + + _log_event( + "tool_call", + f"submit_suggestion: [{params['category']}] {params['title']}", + suggestion, + ) + + # Ensure parent directory exists + _suggestions_path.parent.mkdir(parents=True, exist_ok=True) + + with open(_suggestions_path, "a") as f: + f.write(json.dumps(suggestion) + "\n") + + return {"status": "ok", "message": "Suggestion recorded. Thank you!"} + + +def _tool_set_investigation_summary(params: dict[str, Any]) -> dict[str, Any]: + """Set the investigation title and summary.""" + if _task_dir is None: + raise ValueError("Summary not available - not running in swarm mode") + + summary = { + "title": params["title"], + "summary": params["summary"], + "status": params.get("status", "in_progress"), + "updated_at": datetime.now(UTC).isoformat(), + } + + _log_event( + "tool_call", + f"set_investigation_summary: {params['title']}", + summary, + ) + + summary_path = _task_dir / "summary.json" + summary_path.write_text(json.dumps(summary, indent=2)) + + return {"status": "ok", "path": str(summary_path)} + + +# ============================================================================= +# MCP Protocol Handler +# ============================================================================= + + +def _handle_initialize(_params: dict[str, Any] | None) -> dict[str, Any]: + """Handle initialize request.""" + return { + "protocolVersion": MCP_PROTOCOL_VERSION, + "capabilities": {"tools": {}}, + "serverInfo": {"name": "spd-app", "version": "1.0.0"}, + } + + +def _handle_tools_list() -> dict[str, Any]: + """Handle tools/list request.""" + return {"tools": [t.model_dump() for t in TOOLS]} + + +def _handle_tools_call( + params: dict[str, Any], +) -> 
Generator[dict[str, Any]] | dict[str, Any]: + """Handle tools/call request. May return generator for streaming tools.""" + name = params.get("name") + arguments = params.get("arguments", {}) + + if name == "optimize_graph": + # This tool streams progress + return _tool_optimize_graph(arguments) + elif name == "get_component_info": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_get_component_info(arguments), indent=2)} + ] + } + elif name == "run_ablation": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_run_ablation(arguments), indent=2)} + ] + } + elif name == "search_dataset": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_search_dataset(arguments), indent=2)} + ] + } + elif name == "create_prompt": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_create_prompt(arguments), indent=2)} + ] + } + elif name == "update_research_log": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_update_research_log(arguments), indent=2)} + ] + } + elif name == "save_explanation": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_save_explanation(arguments), indent=2)} + ] + } + elif name == "submit_suggestion": + return { + "content": [ + {"type": "text", "text": json.dumps(_tool_submit_suggestion(arguments), indent=2)} + ] + } + elif name == "set_investigation_summary": + return { + "content": [ + { + "type": "text", + "text": json.dumps(_tool_set_investigation_summary(arguments), indent=2), + } + ] + } + else: + raise ValueError(f"Unknown tool: {name}") + + +@router.post("/mcp") +async def mcp_endpoint(request: Request): + """MCP JSON-RPC endpoint. + + Handles initialize, tools/list, and tools/call methods. + Returns SSE stream for streaming tools, JSON for others. 
+ """ + try: + body = await request.json() + mcp_request = MCPRequest(**body) + except Exception as e: + return JSONResponse( + status_code=400, + content=MCPResponse( + id=None, error={"code": -32700, "message": f"Parse error: {e}"} + ).model_dump(), + ) + + logger.info(f"[MCP] {mcp_request.method} (id={mcp_request.id})") + + try: + if mcp_request.method == "initialize": + result = _handle_initialize(mcp_request.params) + return JSONResponse( + content=MCPResponse(id=mcp_request.id, result=result).model_dump(), + headers={"Mcp-Session-Id": "spd-session"}, + ) + + elif mcp_request.method == "notifications/initialized": + # Client confirms initialization + return JSONResponse(status_code=202, content={}) + + elif mcp_request.method == "tools/list": + result = _handle_tools_list() + return JSONResponse(content=MCPResponse(id=mcp_request.id, result=result).model_dump()) + + elif mcp_request.method == "tools/call": + if mcp_request.params is None: + raise ValueError("tools/call requires params") + + result = _handle_tools_call(mcp_request.params) + + # Check if result is a generator (streaming) + if inspect.isgenerator(result): + # Streaming response via SSE + gen = result # Capture for closure + + def generate_sse() -> Generator[str]: + try: + final_result = None + for event in gen: + if event.get("type") == "progress": + # Send progress notification + progress_msg = { + "jsonrpc": "2.0", + "method": "notifications/progress", + "params": event, + } + yield f"data: {json.dumps(progress_msg)}\n\n" + elif event.get("type") == "result": + final_result = event["data"] + + # Send final response + response = MCPResponse( + id=mcp_request.id, + result={ + "content": [ + {"type": "text", "text": json.dumps(final_result, indent=2)} + ] + }, + ) + yield f"data: {json.dumps(response.model_dump())}\n\n" + except Exception as e: + tb = traceback.format_exc() + logger.error(f"[MCP] Tool error: {e}\n{tb}") + error_response = MCPResponse( + id=mcp_request.id, + error={"code": -32000, 
"message": str(e)}, + ) + yield f"data: {json.dumps(error_response.model_dump())}\n\n" + + return StreamingResponse(generate_sse(), media_type="text/event-stream") + + else: + # Non-streaming response + return JSONResponse( + content=MCPResponse(id=mcp_request.id, result=result).model_dump() + ) + + else: + return JSONResponse( + content=MCPResponse( + id=mcp_request.id, + error={"code": -32601, "message": f"Method not found: {mcp_request.method}"}, + ).model_dump() + ) + + except Exception as e: + tb = traceback.format_exc() + logger.error(f"[MCP] Error handling {mcp_request.method}: {e}\n{tb}") + return JSONResponse( + content=MCPResponse( + id=mcp_request.id, + error={"code": -32000, "message": str(e)}, + ).model_dump() + ) diff --git a/spd/app/backend/server.py b/spd/app/backend/server.py index 45f5d9afb..68316bb69 100644 --- a/spd/app/backend/server.py +++ b/spd/app/backend/server.py @@ -34,6 +34,8 @@ dataset_search_router, graphs_router, intervention_router, + investigations_router, + mcp_router, prompts_router, runs_router, ) @@ -47,6 +49,15 @@ @asynccontextmanager async def lifespan(app: FastAPI): # pyright: ignore[reportUnusedParameter] """Initialize DB connection at startup. 
Model loaded on-demand via /api/runs/load.""" + import os + from pathlib import Path + + from spd.app.backend.routers.mcp import ( + set_events_log_path, + set_suggestions_path, + set_task_dir, + ) + manager = StateManager.get() db = PromptAttrDB(check_same_thread=False) @@ -57,6 +68,22 @@ async def lifespan(app: FastAPI): # pyright: ignore[reportUnusedParameter] logger.info(f"[STARTUP] Device: {DEVICE}") logger.info(f"[STARTUP] CUDA available: {torch.cuda.is_available()}") + # Configure MCP for agent swarm mode + mcp_events_path = os.environ.get("SPD_MCP_EVENTS_PATH") + if mcp_events_path: + set_events_log_path(Path(mcp_events_path)) + logger.info(f"[STARTUP] MCP events logging to: {mcp_events_path}") + + mcp_task_dir = os.environ.get("SPD_MCP_TASK_DIR") + if mcp_task_dir: + set_task_dir(Path(mcp_task_dir)) + logger.info(f"[STARTUP] MCP task dir: {mcp_task_dir}") + + mcp_suggestions_path = os.environ.get("SPD_MCP_SUGGESTIONS_PATH") + if mcp_suggestions_path: + set_suggestions_path(Path(mcp_suggestions_path)) + logger.info(f"[STARTUP] MCP suggestions file: {mcp_suggestions_path}") + yield manager.close() @@ -157,6 +184,8 @@ async def global_exception_handler(request: Request, exc: Exception) -> JSONResp app.include_router(dataset_attributions_router) app.include_router(agents_router) app.include_router(component_data_router) +app.include_router(investigations_router) +app.include_router(mcp_router) def cli(port: int = 8000) -> None: diff --git a/spd/app/frontend/src/components/InvestigationsTab.svelte b/spd/app/frontend/src/components/InvestigationsTab.svelte new file mode 100644 index 000000000..e9b4a7cb8 --- /dev/null +++ b/spd/app/frontend/src/components/InvestigationsTab.svelte @@ -0,0 +1,497 @@ + + +
+ {#if selected?.status === "loaded"} + +
+ +

{selected.data.title || formatId(selected.data.id)}

+ {#if selected.data.status} + + {selected.data.status} + + {/if} +
+ + {#if selected.data.summary} +

{selected.data.summary}

+ {/if} + + +

+ {formatId(selected.data.id)} · Started {formatDate(selected.data.created_at)} + {#if selected.data.wandb_path} + · {selected.data.wandb_path} + {/if} +

+ +
+ + +
+ +
+ {#if activeTab === "research"} +
+ {#if selected.data.research_log} +
{selected.data.research_log}
+ {:else} +

No research log available

+ {/if} +
+ {:else} +
+ {#each selected.data.events as event, i (i)} +
+ + {event.event_type} + + {formatDate(event.timestamp)} + {event.message} + {#if event.details && Object.keys(event.details).length > 0} +
+ Details +
{JSON.stringify(event.details, null, 2)}
+
+ {/if} +
+ {:else} +

No events recorded

+ {/each} +
+ {/if} +
+ {:else if selected?.status === "loading"} +
Loading investigation...
+ {:else} + +
+

Investigations

+ +
+ + {#if investigations.status === "loading"} +
Loading investigations...
+ {:else if investigations.status === "error"} +
{investigations.error}
+ {:else if investigations.status === "loaded"} +
+ {#each investigations.data as inv (inv.id)} + + {:else} +

No investigations found. Run spd-swarm to create one.

+ {/each} +
+ {/if} + {/if} +
+ + diff --git a/spd/app/frontend/src/components/RunView.svelte b/spd/app/frontend/src/components/RunView.svelte index 734c9657d..06fd2ebbe 100644 --- a/spd/app/frontend/src/components/RunView.svelte +++ b/spd/app/frontend/src/components/RunView.svelte @@ -3,13 +3,14 @@ import { RUN_KEY, type RunContext } from "../lib/useRun.svelte"; import ClusterPathInput from "./ClusterPathInput.svelte"; import DatasetExplorerTab from "./DatasetExplorerTab.svelte"; + import InvestigationsTab from "./InvestigationsTab.svelte"; import PromptAttributionsTab from "./PromptAttributionsTab.svelte"; import DisplaySettingsDropdown from "./ui/DisplaySettingsDropdown.svelte"; import ActivationContextsTab from "./ActivationContextsTab.svelte"; const runState = getContext(RUN_KEY); - let activeTab = $state<"prompts" | "components" | "dataset-search" | null>(null); + let activeTab = $state<"prompts" | "components" | "dataset-search" | "investigations" | null>(null); let showRunMenu = $state(false); @@ -32,6 +33,14 @@ {/if}