diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..8dafc8f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,45 @@ +# Virtual environment and build artifacts +.venv/ +*.egg-info/ +dist/ +build/ +__pycache__/ +*.py[cod] + +# Test artifacts +.pytest_cache/ +.coverage +htmlcov/ +coverage.xml + +# Type / lint caches +.mypy_cache/ +.ruff_cache/ + +# Secrets and local config +.env +assets/ + +# Dev tooling +.pre-commit-config.yaml +.claude/ +.idea/ +.vscode/ +*.swp +*.swo +.DS_Store + +# Git +.git/ +.github/ + +# Docs and non-runtime files +docs/ +tests/ +CHANGELOG.md +CONTRIBUTING.md +CLAUDE.md +SECURITY.md +Makefile +README.md +uv.lock diff --git a/.gitignore b/.gitignore index abb7096..c64f816 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ assets/ +.claude/ # Python .venv/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 92ecb6e..c83f769 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.1] - 2026-03-06 + +### Added + +- `combined_report.json` now includes four derived metrics: `avg_time_per_file_seconds`, `avg_time_per_mb_seconds`, `processing_speed_ratio` (real-time factor), and `words_per_audio_hour` (transcription density) +- Slack notifications now include detailed per-stage stats (processed / skipped / failed counts) and average processing time per file +- `make test-slack` Makefile target for validating Slack webhook integration +- Dockerfile and `.dockerignore` for containerized deployment +- Sentiment output directory (`/sentiment/`) support in batch pipeline + +### Changed + +- Centralized Demucs scratch directory resolution in CLI — RAM disk detection and fallback confirmation now happen in one place +- Worker status reporting and failure aggregation in `pipeline-parallel` refactored for improved accuracy +- `python-dotenv` import in Slack notifier is now conditional — avoids import-time failure when the package is absent +- DEPLOYMENT.md expanded: HuggingFace token setup, NVIDIA driver requirements, cloud instance guidelines, and Docker usage +- Combined report fields documented in README under the Parallel Pipeline section + +### Fixed + +- Narrowed exception handling in `gpu_utils.py`, `transcriber.py`, and `notifier.py` to avoid masking unexpected errors +- Typo in `SeparationError` docstring + ## [0.1.0] - 2026-03-01 ### Added @@ -35,5 +58,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `transformers` capped at `<4.40.0` — versions 4.40+ use `torch.utils._pytree.register_pytree_node`, an API introduced in PyTorch 2.2, which breaks with the pinned PyTorch 2.1.2 - `make dev-setup` now reinstalls CUDA torch wheels (`torch==2.1.2+cu121`, `torchaudio==2.1.2+cu121`) as its final step — `uv sync` resolves torch from PyPI and installs the CPU-only build, silently breaking GPU inference -[Unreleased]: https://github.com/LunarCommand/audio-refinery/compare/v0.1.0...HEAD +[Unreleased]: https://github.com/LunarCommand/audio-refinery/compare/v0.1.1...HEAD +[0.1.1]: https://github.com/LunarCommand/audio-refinery/compare/v0.1.0...v0.1.1 [0.1.0]: https://github.com/LunarCommand/audio-refinery/releases/tag/v0.1.0 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1163830 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,39 @@ +FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 + +# System dependencies +RUN apt-get update && apt-get install -y \ + python3.11 python3.11-dev python3-pip 
python3.11-venv \ + ffmpeg git curl \ + && rm -rf /var/lib/apt/lists/* + +# Non-root user +RUN useradd -m -u 1000 refinery +WORKDIR /app +USER refinery + +# Install uv +RUN pip install --user uv + +# Copy and install the package (resolves main deps; may pull CPU-only torch) +COPY --chown=refinery:refinery . . +RUN uv pip install -e . + +# Install WhisperX at the pinned commit — no-deps to avoid overwriting torch +# v3.1.1 tag has the old API without device_index; use the correct commit instead +RUN uv pip install --no-deps \ + "whisperx @ git+https://github.com/m-bain/whisperX.git@741ab9a2a8a1076c171e785363b23c55a91ceff1" + +# Install pinned WhisperX runtime deps +# transformers must stay <4.40.0 — 4.40+ uses torch.utils._pytree.register_pytree_node +# which was added in PyTorch 2.2 and breaks with the pinned 2.1.2 +RUN uv pip install \ + "av==16.1.0" "ctranslate2==4.7.1" "faster-whisper==1.2.1" \ + "flatbuffers==25.12.19" "nltk==3.9.2" "onnxruntime==1.24.1" \ + "transformers>=4.30.0,<4.40.0" + +# Reinstall PyTorch with CUDA 12.1 wheels last — uv pip install -e . above may have +# pulled CPU-only builds; this guarantees the CUDA wheel is what's actually used +RUN uv pip install torch==2.1.2+cu121 torchaudio==2.1.2+cu121 \ + --extra-index-url https://download.pytorch.org/whl/cu121 + +CMD ["audio-refinery", "--help"] diff --git a/Makefile b/Makefile index 0ec7ef6..89dbca1 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,17 @@ dev-setup: install-dev install-whisperx install-torch-cuda pre-commit-install ## @echo " 2. Run 'make test' to verify everything works" @echo " 3. Run 'audio-refinery --help' to see available commands" +test-slack: ## Send a test Slack notification to verify SLACK_WEBHOOK_URL is configured + @uv run python -c "\ +from dotenv import load_dotenv; \ +load_dotenv(); \ +import os, sys, json, urllib.request; \ +url = os.getenv('SLACK_WEBHOOK_URL') or (print('SLACK_WEBHOOK_URL is not set — add it to .env or export it') or sys.exit(1)); \ +data = json.dumps({'text': ':white_check_mark: *Test notification* from \`audio-refinery\` — Slack integration is working.'}).encode(); \ +req = urllib.request.Request(url, data=data, headers={'Content-Type': 'application/json'}); \ +urllib.request.urlopen(req, timeout=5); \ +print('Test notification sent — check your Slack channel')" + stats: ## Show project statistics @echo "Project Statistics:" @echo "===================" diff --git a/README.md b/README.md index 3b748df..96eef5c 100644 --- a/README.md +++ b/README.md @@ -607,6 +607,29 @@ Options: --help Show this message and exit. ``` +### Combined report fields + +`combined_report.json` is always written after all workers finish. 
It contains aggregate metrics across all workers:
+
+| Field | Type | Description |
+|---|---|---|
+| `run_at` | string | ISO 8601 timestamp of run start (UTC) |
+| `total_discovered` | int | Total WAV files found in `extracted/` |
+| `total_time_seconds` | float | Wall-clock seconds from first worker start to last finish |
+| `total_audio_hours` | float | Total audio duration processed across all workers |
+| `source_audio_bytes` | int | Combined size of all input WAV files |
+| `total_words` | int | Total words transcribed across all files |
+| `total_segments` | int | Total transcript segments across all files |
+| `avg_time_per_file_seconds` | float | `total_time / total_discovered` — average wall-clock cost per file |
+| `avg_time_per_mb_seconds` | float | `total_time / source_MB` — processing seconds per MB of source audio |
+| `processing_speed_ratio` | float | `audio_seconds / wall_seconds` — real-time factor (e.g. `3.7` means the pipeline processed audio 3.7× faster than real time) |
+| `words_per_audio_hour` | float | Transcription density — useful for detecting sparse/silent audio or diarization misses |
+| `gpu_temp_celsius` | object | Per-device temperature summary: `peak_celsius`, `avg_celsius`, `sample_count` |
+| `workers` | array | Per-worker label, device, exit code, and individual summary |
+| `combined_failures` | array | Aggregated failure records from all workers |
+
+`null` is written for derived metrics when the divisor is zero (e.g. `avg_time_per_file_seconds` is `null` if no files were discovered).

### Power limit / sudoers

`--power-limit` invokes `sudo nvidia-smi -pl <watts>`. To allow this without a password prompt:
diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md
index 45508b5..9d0437c 100644
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -191,7 +191,62 @@ with engine.connect() as conn:
## Docker Containerization

Containerizing audio-refinery ensures that the PyTorch 2.1.2 + CUDA 12.1 dependency stack is
-portable and reproducible across machines, including cloud GPU instances.
+portable and reproducible across machines. The same image runs on a local GPU workstation and
+on a cloud GPU instance — the difference is how the image is built and delivered, and what
+scratch storage is available.
+
+### Prerequisites
+
+#### HuggingFace Token
+
+Pyannote speaker diarization uses gated models that require HuggingFace authentication:
+
+1. Create a free account at [huggingface.co](https://huggingface.co)
+2. Accept the model terms for [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) and [pyannote/segmentation-3.0](https://huggingface.co/pyannote/segmentation-3.0)
+3. Generate a read token at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+
+Pass the token via `-e HF_TOKEN` or the compose `environment:` block. Without it, the diarization
+stage will fail with a 401 authentication error the first time it tries to download the model.
+
+#### NVIDIA Driver Version
+
+The `nvidia/cuda:12.1.1` base image requires **NVIDIA driver ≥ 525.85.12** on the host.
+Check before pulling the image:
+
+```bash
+nvidia-smi --query-gpu=driver_version --format=csv,noheader
+```
+
+If the driver is older than 525, update it before proceeding.
+
+#### NVIDIA Container Toolkit
+
+Docker cannot access the GPU without the NVIDIA Container Toolkit installed and configured on
+the host. 
This applies to both local workstations and cloud instances — most cloud GPU images +include NVIDIA drivers but not the container toolkit. + +```bash +# Install the toolkit (Ubuntu / Debian) +curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg +curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit + +# Configure Docker to use the NVIDIA runtime +sudo nvidia-ctk runtime configure --runtime=docker +sudo systemctl restart docker +``` + +Verify GPU access inside a container before building: + +```bash +docker run --rm --gpus all nvidia/cuda:12.1.1-base-ubuntu22.04 nvidia-smi +``` + +This should print the same `nvidia-smi` output as the host. If it fails, the toolkit is not +installed correctly — audio-refinery will not be able to use the GPU inside the container. ### Dockerfile @@ -212,25 +267,50 @@ USER refinery # Install uv RUN pip install --user uv -# Install PyTorch first (CUDA 12.1 wheel) -RUN uv pip install torch==2.1.2 torchaudio==2.1.2 \ - --extra-index-url https://download.pytorch.org/whl/cu121 - -# Install WhisperX and runtime deps (must be separate due to ctranslate2 constraints) -RUN uv pip install setuptools && \ - uv pip install --no-deps --no-build-isolation \ - "whisperx @ git+https://github.com/m-bain/whisperX.git@v3.1.1" && \ - uv pip install "ctranslate2>=4.0" "faster-whisper>=1.0.0" \ - "transformers>=4.35.0,<4.42.0" nltk - -# Copy and install the package +# Copy and install the package (resolves main deps; may pull CPU-only torch) COPY --chown=refinery:refinery . . RUN uv pip install -e . +# Install WhisperX at the pinned commit — no-deps to avoid overwriting torch +# v3.1.1 tag has the old API without device_index; use the correct commit instead +RUN uv pip install --no-deps \ + "whisperx @ git+https://github.com/m-bain/whisperX.git@741ab9a2a8a1076c171e785363b23c55a91ceff1" + +# Install pinned WhisperX runtime deps +# transformers must stay <4.40.0 — 4.40+ uses torch.utils._pytree.register_pytree_node +# which was added in PyTorch 2.2 and breaks with the pinned 2.1.2 +RUN uv pip install \ + "av==16.1.0" "ctranslate2==4.7.1" "faster-whisper==1.2.1" \ + "flatbuffers==25.12.19" "nltk==3.9.2" "onnxruntime==1.24.1" \ + "transformers>=4.30.0,<4.40.0" + +# Reinstall PyTorch with CUDA 12.1 wheels last — uv pip install -e . above may have +# pulled CPU-only builds; this guarantees the CUDA wheel is what's actually used +RUN uv pip install torch==2.1.2+cu121 torchaudio==2.1.2+cu121 \ + --extra-index-url https://download.pytorch.org/whl/cu121 + CMD ["audio-refinery", "--help"] ``` -### docker-compose.yml +### Building the Image + +```bash +docker build -t audio-refinery:latest . +``` + +The build clones WhisperX from GitHub and downloads PyTorch CUDA wheels, so it requires internet +access and takes 10–20 minutes on first run. Subsequent builds are faster due to layer caching, +provided `pyproject.toml` has not changed. + +--- + +### Running Locally (Workstation) + +For a local GPU workstation, build the image once and run it via compose for sustained batch +runs, or `docker run` for one-off jobs. 
Audio data and the RAM disk are bind-mounted from the +host, so the container has no persistent state of its own. + +**Sustained batch — docker-compose.yml:** ```yaml services: @@ -244,8 +324,8 @@ services: device_ids: ['0'] # Pin to specific GPU by PCI ID capabilities: [gpu] volumes: - - /data/audio:/data # Persistent audio storage - - /mnt/fast_scratch:/mnt/fast_scratch # RAM disk (mount on host first) + - /data/audio:/data # Persistent audio storage + - /mnt/fast_scratch:/mnt/fast_scratch # RAM disk (mount on host first) environment: - HF_TOKEN=${HF_TOKEN} - SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL} @@ -255,21 +335,80 @@ services: --compute-type int8_float16 ``` -### Running in the cloud +**Ad-hoc run:** + +```bash +docker run --rm --gpus '"device=0"' \ + -v /data/audio:/data \ + -v /mnt/fast_scratch:/mnt/fast_scratch \ + -e HF_TOKEN="${HF_TOKEN}" \ + -e SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL}" \ + audio-refinery:latest \ + audio-refinery pipeline --base-dir /data/batch --compute-type int8_float16 +``` + +`--gpus '"device=0"'` pins to a specific GPU by index. Use `--gpus all` to expose all GPUs. + +--- + +### Deploying to the Cloud + +Cloud deployment follows the same container pattern as local, with two differences: the image +is delivered via a registry rather than built in place, and NVMe instance storage substitutes +for the RAM disk. + +#### Instance Selection + +The recommended minimum is a **24 GB GPU** to hold all models resident simultaneously. On a +10–12 GB GPU, models are loaded and unloaded between stages, adding 10–30 seconds of overhead +per file. See [VRAM Footprint by Stage](#vram-footprint-by-stage) for a full breakdown. + +Common instance types that meet the 24 GB threshold: NVIDIA A10G (AWS g5), L4 (GCP g2), RTX +3090 / 4090 (bare metal providers). + +#### Registry Workflow + +Build and push from your local machine (or a CI runner), then pull on the cloud instance: + +```bash +# Build and push (local machine) +docker build -t your-registry/audio-refinery:latest . +docker push your-registry/audio-refinery:latest + +# Pull and run (cloud instance — after completing Prerequisites above) +docker pull your-registry/audio-refinery:latest +docker run --rm --gpus '"device=0"' \ + -v /data/audio:/data \ + -v /mnt/nvme:/mnt/fast_scratch \ + -e HF_TOKEN="${HF_TOKEN}" \ + -e SLACK_WEBHOOK_URL="${SLACK_WEBHOOK_URL}" \ + your-registry/audio-refinery:latest \ + audio-refinery pipeline --base-dir /data/batch --compute-type int8_float16 +``` + +Alternatively, build directly on the cloud instance if it has internet access and you want to +skip managing a registry: -On a cloud GPU instance without a RAM disk, substitute a high-bandwidth NVMe instance volume -for `/mnt/fast_scratch`. Cloud GPU instances typically provide NVMe-backed instance storage at -2–4 GB/s write throughput — adequate as a scratch substitute at the cost of some SSD wear. +```bash +git clone https://github.com/LunarCommand/audio-refinery.git +cd audio-refinery +docker build -t audio-refinery:latest . +``` + +#### Scratch Storage + +Cloud GPU instances do not have a RAM disk. 
Use a high-bandwidth NVMe instance volume as scratch: ```bash -# Use a local NVMe volume as scratch instead of RAM disk -audio-refinery pipeline \ - --base-dir /data/batch \ - --compute-type int8_float16 - # The pipeline will prompt for confirmation before writing to local storage - # if /mnt/fast_scratch is not mounted +# Mount instance NVMe storage (device path varies by provider) +sudo mkdir -p /mnt/nvme +sudo mount /dev/nvme1n1 /mnt/nvme ``` +Pass `/mnt/nvme` as the scratch bind mount (`-v /mnt/nvme:/mnt/fast_scratch`). Cloud NVMe +instance storage typically provides 2–4 GB/s write throughput — adequate as a scratch substitute +at the cost of some SSD wear. + --- ## Monitoring diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md index 21e8f7d..66048d7 100644 --- a/docs/DEVELOPMENT.md +++ b/docs/DEVELOPMENT.md @@ -18,11 +18,11 @@ This guide covers setting up audio-refinery for development, testing, and contri The three GPU-resident models have the following approximate VRAM footprints: -| Model | Stage | Peak VRAM | -|---|---|:---:| -| Demucs `htdemucs` | Vocal separation | ~4 GB | -| Pyannote `speaker-diarization-3.1` | Diarization | ~1 GB | -| WhisperX `large-v3` | Transcription | ~10 GB | +| Model | Stage | Peak VRAM | +|------------------------------------|------------------|:---------:| +| Demucs `htdemucs` | Vocal separation | ~4 GB | +| Pyannote `speaker-diarization-3.1` | Diarization | ~1 GB | +| WhisperX `large-v3` | Transcription | ~10 GB | A **24 GB GPU** (RTX 3090, 3090 Ti, 4090, A5000, etc.) holds all three models simultaneously with room for a comfortable batch size (16–32). This is the recommended configuration for @@ -259,12 +259,12 @@ Hooks run: Each stage is a standalone module with a pure-function API: -| Module | Function | Output Model | -|--------|----------|--------------| -| `separator.py` | `separate()` | `SeparationResult` | -| `diarizer.py` | `diarize()` | `DiarizationResult` | -| `transcriber.py` | `transcribe()` | `TranscriptionResult` | -| `sentiment_analyzer.py` | `analyze_sentiment()` | `SentimentResult` | +| Module | Function | Output Model | +|-------------------------|-----------------------|-----------------------| +| `separator.py` | `separate()` | `SeparationResult` | +| `diarizer.py` | `diarize()` | `DiarizationResult` | +| `transcriber.py` | `transcribe()` | `TranscriptionResult` | +| `sentiment_analyzer.py` | `analyze_sentiment()` | `SentimentResult` | ### Data Flow @@ -371,15 +371,15 @@ Before creating a release: After release: -- [ ] GitHub Release page shows correct version and artifacts +- [ ] GitHub Release page shows the correct version and artifacts - [ ] Release notes are accurate ### Quick Reference ```bash # Complete release workflow -git checkout main && git pull -git checkout -b release/vX.Y.Z +git switch main && git pull +git switch -c release/vX.Y.Z # Update pyproject.toml version and CHANGELOG.md make all-checks git add pyproject.toml CHANGELOG.md diff --git a/pyproject.toml b/pyproject.toml index c482afc..48aceb2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "audio-refinery" -version = "0.1.0" +version = "0.1.1" description = "GPU-accelerated audio processing pipeline: vocal separation, speaker diarization, transcription, and sentiment analysis." 
readme = "README.md" license = { text = "MIT" } diff --git a/src/cli.py b/src/cli.py index c30e192..d44f983 100644 --- a/src/cli.py +++ b/src/cli.py @@ -7,6 +7,8 @@ import sys import threading import time +from collections import Counter +from pathlib import Path # Force PCI bus order so CUDA device indices match nvidia-smi numbering. # Must be set before any CUDA context is created. @@ -145,12 +147,66 @@ def _fmt_time(seconds: float) -> str: return f"{mins}m {secs}s" +def _mkdir_demucs(demucs_path: Path, base_path: Path, demucs_on_ramdisk: bool) -> tuple[Path, bool]: + """Create demucs_path, prompting for local fallback on PermissionError. + + Returns (final_demucs_path, final_demucs_on_ramdisk). + """ + try: + demucs_path.mkdir(parents=True, exist_ok=True) + return demucs_path, demucs_on_ramdisk + except PermissionError: + console.print( + Panel( + "[bold yellow]/mnt/fast_scratch is not writable.[/bold yellow]\n\n" + "The RAM disk is mounted but the current user cannot write to it.\n" + "Remount with open permissions:\n\n" + " [dim]sudo mount -o remount,mode=1777 /mnt/fast_scratch[/dim]\n\n" + f" Fallback path: [bold]{base_path / 'demucs'}[/bold]", + title="[yellow bold]RAM Disk Not Writable[/yellow bold]", + border_style="yellow", + ) + ) + if not click.confirm("Continue using local storage for Demucs scratch?", default=False): + console.print("[dim]Aborted.[/dim]") + sys.exit(0) + demucs_path = base_path / "demucs" + demucs_path.mkdir(parents=True, exist_ok=True) + return demucs_path, False + + +def _resolve_demucs_scratch(base_path: Path) -> tuple[Path, bool]: + """Resolve the Demucs scratch directory, prompting if /mnt/fast_scratch is unavailable. + + Returns (demucs_path, demucs_on_ramdisk). Exits if the user declines local fallback. + """ + fast_scratch = Path("/mnt/fast_scratch") + if fast_scratch.is_mount(): + return fast_scratch / "demucs", True + console.print( + Panel( + "[bold yellow]/mnt/fast_scratch is not mounted.[/bold yellow]\n\n" + "The RAM disk is not available. 
Without it, Demucs scratch files will be\n" + "written to local storage, which is slower and increases SSD wear.\n\n" + f" Fallback path: [bold]{base_path / 'demucs'}[/bold]\n\n" + "To mount the RAM disk before running:\n" + " [dim]sudo mount -t tmpfs -o size=32G,mode=1777 tmpfs /mnt/fast_scratch[/dim]", + title="[yellow bold]RAM Disk Not Available[/yellow bold]", + border_style="yellow", + ) + ) + if not click.confirm("Continue using local storage for Demucs scratch?", default=False): + console.print("[dim]Aborted.[/dim]") + sys.exit(0) + return base_path / "demucs", False + + @click.group() def cli(): """Audio Refinery — Audio processing pipeline.""" -@cli.command() +@cli.command("separate") @click.argument("input_file", type=click.Path(exists=True, dir_okay=False, resolve_path=True)) @click.option( "-o", @@ -177,8 +233,6 @@ def cli(): ) def separate_cmd(input_file: str, output_dir: str, device: str, segment: int | None): """Run Demucs vocal separation on an audio file.""" - from pathlib import Path - input_path = Path(input_file) output_path = Path(output_dir) @@ -231,7 +285,7 @@ def separate_cmd(input_file: str, output_dir: str, device: str, segment: int | N ) -@cli.command() +@cli.command("diarize") @click.argument("input_file", type=click.Path(exists=True, dir_okay=False, resolve_path=True)) @click.option( "-d", @@ -275,8 +329,6 @@ def diarize_cmd( output_file: str | None, ): """Run Pyannote speaker diarization on an audio file.""" - from pathlib import Path - input_path = Path(input_file) speaker_hints = "" @@ -355,7 +407,7 @@ def diarize_cmd( ) -@cli.command() +@cli.command("transcribe") @click.argument("input_file", type=click.Path(exists=True, dir_okay=False, resolve_path=True)) @click.option( "-d", @@ -409,8 +461,6 @@ def transcribe_cmd( output_file: str | None, ): """Run WhisperX transcription on an audio file.""" - from pathlib import Path - input_path = Path(input_file) diarization_path = Path(diarization_file) if diarization_file else None @@ -538,8 +588,6 @@ def sentiment_cmd(transcription_file: str, model: str, device: str, output_file: TRANSCRIPTION_FILE is also updated in place with sentiment fields merged into each segment, giving a single enriched output for downstream use. """ - from pathlib import Path - tx_path = Path(transcription_file) console.print( @@ -824,8 +872,6 @@ def pipeline( /mnt/fast_scratch/demucs — RAM disk (used automatically if mounted) /demucs — disk fallback (requires confirmation) """ - from pathlib import Path - from rich.progress import BarColumn, MofNCompleteColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn from src.pipeline import discover_files @@ -860,49 +906,9 @@ def pipeline( demucs_path = Path(demucs_dir) demucs_on_ramdisk = demucs_path.is_mount() else: - fast_scratch = Path("/mnt/fast_scratch") - if fast_scratch.is_mount(): - demucs_path = fast_scratch / "demucs" - demucs_on_ramdisk = True - else: - console.print( - Panel( - "[bold yellow]/mnt/fast_scratch is not mounted.[/bold yellow]\n\n" - "The RAM disk is not available. 
Without it, Demucs scratch files will be\n" - "written to local storage, which is slower and increases SSD wear.\n\n" - f" Fallback path: [bold]{base_path / 'demucs'}[/bold]\n\n" - "To mount the RAM disk before running:\n" - " [dim]sudo mount -t tmpfs -o size=32G,mode=1777 tmpfs /mnt/fast_scratch[/dim]", - title="[yellow bold]RAM Disk Not Available[/yellow bold]", - border_style="yellow", - ) - ) - if not click.confirm("Continue using local storage for Demucs scratch?", default=False): - console.print("[dim]Aborted.[/dim]") - sys.exit(0) - demucs_path = base_path / "demucs" - demucs_on_ramdisk = False + demucs_path, demucs_on_ramdisk = _resolve_demucs_scratch(base_path) - try: - demucs_path.mkdir(parents=True, exist_ok=True) - except PermissionError: - console.print( - Panel( - "[bold yellow]/mnt/fast_scratch is not writable.[/bold yellow]\n\n" - "The RAM disk is mounted but the current user cannot write to it.\n" - "Remount with open permissions:\n\n" - " [dim]sudo mount -o remount,mode=1777 /mnt/fast_scratch[/dim]\n\n" - f" Fallback path: [bold]{base_path / 'demucs'}[/bold]", - title="[yellow bold]RAM Disk Not Writable[/yellow bold]", - border_style="yellow", - ) - ) - if not click.confirm("Continue using local storage for Demucs scratch?", default=False): - console.print("[dim]Aborted.[/dim]") - sys.exit(0) - demucs_path = base_path / "demucs" - demucs_on_ramdisk = False - demucs_path.mkdir(parents=True, exist_ok=True) + demucs_path, demucs_on_ramdisk = _mkdir_demucs(demucs_path, base_path, demucs_on_ramdisk) for path in [diar_path, tx_path, summary_dir]: path.mkdir(parents=True, exist_ok=True) if sentiment: @@ -1010,9 +1016,9 @@ def _on_progress(content_id: str, stage: str, i: int, n: int) -> None: if _progress_file is not None: _progress_file.write_text(json.dumps({"done": i, "total": n, "current": "", "stage": "done"})) return - colour = _stage_colours.get(stage, "cyan") + stage_colour = _stage_colours.get(stage, "cyan") label = _stage_labels.get(stage, stage) - # Update temperature reading at most once every 5 s. + # Update the temperature reading at most once every 5 s. now = time.monotonic() if device != "cpu" and now - _temp_state["ts"] >= 5.0: _t = query_gpu_temperature(_cuda_idx) @@ -1021,7 +1027,9 @@ def _on_progress(content_id: str, stage: str, i: int, n: int) -> None: if _t is not None: _temp_state["readings"].append(_t) temp_str = f" · {_fmt_temp(_temp_state['value'], temp_limit)}" if device != "cpu" else "" - progress.update(task, completed=i, description=f"[{colour}]{label}[/{colour}] · {content_id}{temp_str}") + progress.update( + task, completed=i, description=f"[{stage_colour}]{label}[/{stage_colour}] · {content_id}{temp_str}" + ) # Plain-text fallback when stdout is redirected to a log file (no TTY). 
if not sys.stdout.isatty(): click.echo(f"[{time.strftime('%H:%M:%S')}] {label} {i}/{n} · {content_id}") @@ -1249,6 +1257,35 @@ def _stage_stats(stage_result): completed=completed, failures=len(all_failures), elapsed_seconds=total_time, + stages={ + "separation": { + "processed": sep_result.n_succeeded, + "skipped": sep_result.n_skipped, + "failed": sep_result.n_failed, + }, + "diarization": { + "processed": diar_result.n_succeeded, + "skipped": diar_result.n_skipped, + "failed": diar_result.n_failed, + }, + "transcription": { + "processed": tx_result.n_succeeded, + "skipped": tx_result.n_skipped, + "failed": tx_result.n_failed, + }, + **( + { + "sentiment": { + "processed": sent_result.n_succeeded, + "skipped": sent_result.n_skipped, + "failed": sent_result.n_failed, + } + } + if sentiment + else {} + ), + }, + avg_per_file_seconds=pipeline_avg, ) if all_failures: @@ -1393,7 +1430,6 @@ def pipeline_parallel( /summary/ — per-worker summaries + combined_report.json """ from datetime import UTC, datetime - from pathlib import Path from src.pipeline import discover_files, partition_ids @@ -1423,53 +1459,13 @@ def pipeline_parallel( _warn_if_gpu_busy(list(devices)) # ── Resolve Demucs scratch (interactive if needed; performed once here) ── - fast_scratch = Path("/mnt/fast_scratch") - if fast_scratch.is_mount(): - demucs_path = fast_scratch / "demucs" - demucs_on_ramdisk = True - else: - console.print( - Panel( - "[bold yellow]/mnt/fast_scratch is not mounted.[/bold yellow]\n\n" - "The RAM disk is not available. Without it, Demucs scratch files will be\n" - "written to local storage, which is slower and increases SSD wear.\n\n" - f" Fallback path: [bold]{base_path / 'demucs'}[/bold]\n\n" - "To mount the RAM disk before running:\n" - " [dim]sudo mount -t tmpfs -o size=32G,mode=1777 tmpfs /mnt/fast_scratch[/dim]", - title="[yellow bold]RAM Disk Not Available[/yellow bold]", - border_style="yellow", - ) - ) - if not click.confirm("Continue using local storage for Demucs scratch?", default=False): - console.print("[dim]Aborted.[/dim]") - sys.exit(0) - demucs_path = base_path / "demucs" - demucs_on_ramdisk = False + demucs_path, demucs_on_ramdisk = _resolve_demucs_scratch(base_path) # ── Create working directories ────────────────────────────────────────── manifests_dir = base_path / "manifests" logs_dir = base_path / "logs" summary_dir = base_path / "summary" - try: - demucs_path.mkdir(parents=True, exist_ok=True) - except PermissionError: - console.print( - Panel( - "[bold yellow]/mnt/fast_scratch is not writable.[/bold yellow]\n\n" - "The RAM disk is mounted but the current user cannot write to it.\n" - "Remount with open permissions:\n\n" - " [dim]sudo mount -o remount,mode=1777 /mnt/fast_scratch[/dim]\n\n" - f" Fallback path: [bold]{base_path / 'demucs'}[/bold]", - title="[yellow bold]RAM Disk Not Writable[/yellow bold]", - border_style="yellow", - ) - ) - if not click.confirm("Continue using local storage for Demucs scratch?", default=False): - console.print("[dim]Aborted.[/dim]") - sys.exit(0) - demucs_path = base_path / "demucs" - demucs_on_ramdisk = False - demucs_path.mkdir(parents=True, exist_ok=True) + demucs_path, demucs_on_ramdisk = _mkdir_demucs(demucs_path, base_path, demucs_on_ramdisk) for path in [manifests_dir, logs_dir, summary_dir]: path.mkdir(parents=True, exist_ok=True) @@ -1534,14 +1530,16 @@ def pipeline_parallel( # ── Build worker commands ─────────────────────────────────────────────── refinery_cmd = sys.argv[0] - def _build_worker_cmd(device: str, manifest_path: Path, 
summary_path: Path, progress_path: Path) -> list[str]: + def _build_worker_cmd( + worker_device: str, manifest_path: Path, summary_path: Path, progress_path: Path + ) -> list[str]: cmd = [ refinery_cmd, "pipeline", "--base-dir", str(base_path), "--device", - device, + worker_device, "--demucs-dir", str(demucs_path), "--manifest", @@ -1579,9 +1577,9 @@ def _build_worker_cmd(device: str, manifest_path: Path, summary_path: Path, prog scratch_suffix = "(RAM disk)" if demucs_on_ramdisk else "(disk)" tflops_table = load_tflops_table() - def _gpu_stat_line(device: str) -> str: - idx = int(device.split(":")[1]) if ":" in device else 0 - info = query_gpu_info(idx) + def _gpu_stat_line(gpu_device: str) -> str: + gpu_idx = int(gpu_device.split(":")[1]) if ":" in gpu_device else 0 + info = query_gpu_info(gpu_idx) if info is None: return "[dim]GPU info unavailable[/dim]" vram_gb = round(info.vram_mib / 1024) @@ -1621,10 +1619,10 @@ def _gpu_stat_line(device: str) -> str: gpu_temps: dict[str, int | None] = {w["device"]: None for w in workers} gpu_temp_readings: dict[str, list[int]] = {w["device"]: [] for w in workers} - def _read_progress(path: Path) -> dict: + def _read_progress(fpath: Path) -> dict: try: - return json.loads(path.read_text()) - except Exception: + return json.loads(fpath.read_text()) + except (OSError, json.JSONDecodeError): return {"done": 0, "total": "?", "current": "—", "stage": "starting"} def _worker_status_table() -> Table: @@ -1642,18 +1640,18 @@ def _worker_status_table() -> Table: tbl.add_column("Stage", width=10) tbl.add_column("File") tbl.add_column("Progress", justify="right", width=10) - for w in workers: - p = _read_progress(w["progress"]) + for wdict in workers: + p = _read_progress(wdict["progress"]) done, total_w = p.get("done", 0), p.get("total", "?") stage_val = p.get("stage", "—") if stage_val == "done": - n_failures = p.get("failures", None) - stage_display = "[yellow]Done[/yellow]" if n_failures else "[green]Done[/green]" + worker_failures = p.get("failures", None) + stage_display = "[yellow]Done[/yellow]" if worker_failures else "[green]Done[/green]" file_display = "" else: stage_display, file_display = stage_val, p.get("current", "—") - temp_display = _fmt_temp(gpu_temps.get(w["device"]), temp_limit) - tbl.add_row(w["label"], w["device"], temp_display, stage_display, file_display, f"{done}/{total_w}") + temp_display = _fmt_temp(gpu_temps.get(wdict["device"]), temp_limit) + tbl.add_row(wdict["label"], wdict["device"], temp_display, stage_display, file_display, f"{done}/{total_w}") return tbl try: @@ -1699,18 +1697,13 @@ def _worker_status_table() -> Table: total_time = time.monotonic() - t0 - # ── Report exit status ────────────────────────────────────────────────── - for w in workers: - ok = w["rc"] == 0 - status = "[green]OK[/green]" if ok else f"[red]FAILED (exit {w['rc']})[/red]" - console.print(f"Worker {w['label']} ({w['device']}): {status}") console.print(f"[dim]Total wall-clock time: {_fmt_time(total_time)}[/dim]\n") # ── Aggregate summaries ───────────────────────────────────────────────── - def _load_summary(path: Path) -> dict | None: + def _load_summary(fpath: Path) -> dict | None: try: - return json.loads(path.read_text()) - except Exception: + return json.loads(fpath.read_text()) + except (OSError, json.JSONDecodeError): return None all_combined_failures: list[dict] = [] @@ -1722,6 +1715,19 @@ def _load_summary(path: Path) -> dict | None: for f in summary.get("failures", []): all_combined_failures.append({"worker": w["label"], "device": w["device"], 
**f}) + # ── Report exit status ────────────────────────────────────────────────── + for i, w in enumerate(workers): + summary = worker_summaries[i] + if w["rc"] == 0: + status = "[green]OK[/green]" + elif summary is not None: + n_failures = len(summary.get("failures", [])) + label = "failure" if n_failures == 1 else "failures" + status = f"[yellow]Completed ({n_failures} file {label})[/yellow]" + else: + status = f"[red]FAILED (exit {w['rc']})[/red]" + console.print(f"Worker {w['label']} ({w['device']}): {status}") + _notif_processed = 0 _notif_failures = 0 @@ -1732,11 +1738,11 @@ def _load_summary(path: Path) -> dict | None: combined.add_column("Skipped", justify="right") combined.add_column("Failed", justify="right") - def _agg(key: str, sub: str) -> int: + def _agg(agg_key: str, sub: str) -> int: agg_total = 0 for s in worker_summaries: if s: - agg_total += s.get("stages", {}).get(key, {}).get(sub, 0) + agg_total += s.get("stages", {}).get(agg_key, {}).get(sub, 0) return agg_total stage_rows = [ @@ -1776,6 +1782,10 @@ def _agg(key: str, sub: str) -> int: # ── Combined failure report (printed) ────────────────────────────────── if all_combined_failures: + failure_counts: Counter = Counter() + for f in all_combined_failures: + key = (f.get("worker", ""), f.get("device", ""), f.get("stage", ""), f.get("error", "")) + failure_counts[key] += 1 fail_table = Table( title=f"Combined Failure Report ({len(all_combined_failures)} failures)", border_style="red", @@ -1783,16 +1793,10 @@ def _agg(key: str, sub: str) -> int: fail_table.add_column("Worker", style="bold") fail_table.add_column("Device") fail_table.add_column("Stage") - fail_table.add_column("Content ID") + fail_table.add_column("Count", justify="right") fail_table.add_column("Error") - for f in all_combined_failures: - fail_table.add_row( - f.get("worker", ""), - f.get("device", ""), - f.get("stage", ""), - f.get("content_id", ""), - f.get("error", ""), - ) + for (worker, device, stage, error), count in sorted(failure_counts.items()): + fail_table.add_row(worker, device, stage, str(count), error) console.print(fail_table) # ── Write combined_report.json (always) ──────────────────────────────── @@ -1805,14 +1809,24 @@ def _agg(key: str, sub: str) -> int: for device, readings in gpu_temp_readings.items() if readings } or None + _cr_n_files = len(all_ids) + _cr_total_audio_hours = round(sum(s.get("total_audio_hours", 0.0) for s in worker_summaries if s), 4) + _cr_source_audio_bytes = sum(s.get("source_audio_bytes", 0) for s in worker_summaries if s) + _cr_total_words = sum(s.get("total_words", 0) for s in worker_summaries if s) + _cr_total_segments = sum(s.get("total_segments", 0) for s in worker_summaries if s) + _cr_source_mb = _cr_source_audio_bytes / 1_000_000 combined_report = { "run_at": datetime.now(UTC).isoformat(), - "total_discovered": len(all_ids), + "total_discovered": _cr_n_files, "total_time_seconds": round(total_time, 2), - "total_audio_hours": round(sum(s.get("total_audio_hours", 0.0) for s in worker_summaries if s), 4), - "source_audio_bytes": sum(s.get("source_audio_bytes", 0) for s in worker_summaries if s), - "total_words": sum(s.get("total_words", 0) for s in worker_summaries if s), - "total_segments": sum(s.get("total_segments", 0) for s in worker_summaries if s), + "total_audio_hours": _cr_total_audio_hours, + "source_audio_bytes": _cr_source_audio_bytes, + "total_words": _cr_total_words, + "total_segments": _cr_total_segments, + "avg_time_per_file_seconds": round(total_time / _cr_n_files, 2) if _cr_n_files else 
None, + "avg_time_per_mb_seconds": round(total_time / _cr_source_mb, 4) if _cr_source_mb else None, + "processing_speed_ratio": round(_cr_total_audio_hours * 3600 / total_time, 3) if total_time else None, + "words_per_audio_hour": round(_cr_total_words / _cr_total_audio_hours, 1) if _cr_total_audio_hours else None, "gpu_temp_celsius": _gpu_temp_summaries, "workers": [ { @@ -1829,21 +1843,32 @@ def _agg(key: str, sub: str) -> int: combined_report_path.write_text(json.dumps(combined_report, indent=2)) console.print(f"[dim]Combined report written to: {combined_report_path}[/dim]") - worker_statuses = [(w["label"], w["device"], w["rc"] == 0) for w in workers] + worker_statuses = [] + for i, w in enumerate(workers): + ws = worker_summaries[i] + n_fail = len(ws.get("failures", [])) if ws is not None else 0 + worker_statuses.append((w["label"], w["device"], w["rc"], n_fail)) + _parallel_stages: dict[str, dict[str, int]] | None = None + if any(s for s in worker_summaries): + _parallel_stages = { + stage_key: { + "processed": _agg(stage_key, "processed"), + "skipped": _agg(stage_key, "skipped"), + "failed": _agg(stage_key, "failed"), + } + for stage_key in (["separation", "diarization", "transcription"] + (["sentiment"] if sentiment else [])) + } notify_pipeline_parallel_complete( worker_statuses=worker_statuses, total_discovered=len(all_ids), total_processed=_notif_processed, failures=_notif_failures, elapsed_seconds=total_time, + stages=_parallel_stages, + avg_per_file_seconds=combined_avg if _notif_processed else 0.0, ) if not all(w["rc"] == 0 for w in workers): failed_logs = "\n".join(f" {w['log']}" for w in workers) console.print(f"[dim]Worker logs retained for inspection:[/dim]\n{failed_logs}") sys.exit(1) - - -cli.add_command(separate_cmd, name="separate") -cli.add_command(diarize_cmd, name="diarize") -cli.add_command(transcribe_cmd, name="transcribe") diff --git a/src/diarizer.py b/src/diarizer.py index 9aa6110..c8840f1 100644 --- a/src/diarizer.py +++ b/src/diarizer.py @@ -49,7 +49,7 @@ def __init__(self, message: str): def _resolve_hf_token(hf_token: str | None) -> str: - """Return the HuggingFace token, loading from environment if not provided. + """Return the HuggingFace token, loading from the environment if not provided. Raises DiarizationError if no token is found. """ @@ -87,7 +87,7 @@ def load_pipeline(model: str, device: str, hf_token: str): try: import torch - # Pass the token via environment variable rather than as a kwarg. + # Pass the token via the environment variable rather than as a kwarg. # pyannote's from_pretrained() signature varies across versions, but # huggingface_hub always reads HF_TOKEN from the environment automatically. os.environ["HF_TOKEN"] = hf_token @@ -126,6 +126,8 @@ def diarize( max_speakers: Optional upper bound on speaker count. hf_token: HuggingFace token. If None, reads from HF_TOKEN env var. model: Pyannote model ID. + _pipeline: Pre-loaded Pyannote pipeline instance. Skips model loading when + provided; intended for testing and pipeline reuse. Returns: DiarizationResult with full provenance of the diarization run. 
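The `_pipeline` parameter documented above lets callers load the Pyannote pipeline once and reuse it across files. A minimal sketch of that pattern, assuming the `load_pipeline(model, device, hf_token)` signature shown earlier in this diff; the file paths, token value, and the name of `diarize()`'s positional audio argument are illustrative, not confirmed by the source:

```python
# Illustrative sketch: reuse one loaded Pyannote pipeline across many files.
from src.diarizer import diarize, load_pipeline

token = "hf_..."  # placeholder; normally resolved from the HF_TOKEN env var
pipe = load_pipeline("pyannote/speaker-diarization-3.1", "cuda:0", token)

for wav in ("extracted/a.wav", "extracted/b.wav"):  # hypothetical inputs
    # Passing _pipeline skips the per-file model load inside diarize().
    result = diarize(wav, hf_token=token, _pipeline=pipe)
```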
diff --git a/src/gpu_utils.py b/src/gpu_utils.py index ba5ce45..0c36abd 100644 --- a/src/gpu_utils.py +++ b/src/gpu_utils.py @@ -60,7 +60,7 @@ def load_tflops_table() -> dict[str, float]: with open(_TFLOPS_TABLE_PATH, "rb") as f: data = tomllib.load(f) return {k: float(v) for k, v in data.get("tflops", {}).items()} - except Exception: + except (OSError, KeyError, ValueError, tomllib.TOMLDecodeError): return {} @@ -123,13 +123,13 @@ def _sort_key(g: tuple[int, int, int, str]) -> tuple: tflops = lookup_tflops(name, tflops_table) if tflops is not None: # Tier 1: known GPU — rank by TFLOPS. - return (1, tflops, 0, 0) + return 1, tflops, 0, 0 # Tier 0: unknown GPU — rank by rounded VRAM then SM clock. - return (0, 0.0, round(mem_mib / 1024), sm_clock) + return 0, 0.0, round(mem_mib / 1024), sm_clock gpus.sort(key=_sort_key, reverse=True) return tuple(f"cuda:{g[0]}" for g in gpus) - except Exception: + except (subprocess.TimeoutExpired, subprocess.SubprocessError, OSError, ValueError): return ("cuda:0",) diff --git a/src/notifier.py b/src/notifier.py index 729b2cd..8751182 100644 --- a/src/notifier.py +++ b/src/notifier.py @@ -1,7 +1,7 @@ """Slack webhook notifications for pipeline events. Set ``SLACK_WEBHOOK_URL`` in your ``.env`` file or the shell environment -to enable. All functions are fire-and-forget — errors are silently ignored so +to enable. All functions are fire-and-forget — errors are silently ignored, so a notification failure can never block or abort the pipeline. """ @@ -12,17 +12,16 @@ import urllib.error import urllib.request +try: + from dotenv import load_dotenv as _load_dotenv +except ImportError: + _load_dotenv = None # type: ignore[assignment] + def _send(text: str) -> None: """POST a plain-text Slack message to the configured webhook URL, if any.""" - # Load .env lazily so the webhook URL is available even if this fires before - # the diarizer stage (which is when load_dotenv() normally runs). - try: - from dotenv import load_dotenv - - load_dotenv() - except ImportError: - pass + if _load_dotenv is not None: + _load_dotenv() url = os.getenv("SLACK_WEBHOOK_URL") if not url: @@ -31,7 +30,7 @@ def _send(text: str) -> None: data = json.dumps({"text": text}).encode() req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"}) urllib.request.urlopen(req, timeout=5) - except Exception: + except (urllib.error.URLError, OSError, ValueError): pass # Never raise — notification failure must not interrupt the pipeline @@ -40,20 +39,56 @@ def _fmt_elapsed(seconds: float) -> str: return f"{mins}m {secs}s" if mins else f"{secs}s" +_STAGE_LABELS = { + "separation": "Vocal separation", + "diarization": "Speaker diarization", + "transcription": "Transcription", + "sentiment": "Text sentiment", +} + + def notify_pipeline_complete( device: str, total: int, completed: int, failures: int, elapsed_seconds: float, + stages: dict[str, dict[str, int]] | None = None, + avg_per_file_seconds: float = 0.0, ) -> None: - """Send a Slack notification when a single-GPU pipeline run ends.""" + """Send a Slack notification when a single-GPU pipeline run ends. + + Args: + device: PyTorch device string, e.g. ``"cuda:0"``. + total: Total files discovered. + completed: Files successfully transcribed (processed + skipped). + failures: Total stage-level failures across all stages. + elapsed_seconds: Wall-clock runtime in seconds. + stages: Optional per-stage counts keyed by stage name, each containing + ``"processed"``, ``"skipped"``, and ``"failed"`` counts, e.g. 
+ ``{"separation": {"processed": 12, "skipped": 0, "failed": 0}, ...}``. + avg_per_file_seconds: Average wall-clock time per file across the full pipeline. + """ elapsed = _fmt_elapsed(elapsed_seconds) if failures == 0: icon, status = ":white_check_mark:", "Pipeline complete" else: icon, status = ":warning:", f"Pipeline complete with {failures} failure(s)" - _send(f"{icon} *{status}* on `{device}`\n{completed}/{total} files processed in {elapsed}") + avg_str = f" · avg/file: {_fmt_elapsed(avg_per_file_seconds)}" if avg_per_file_seconds else "" + lines = [ + f"{icon} *{status}* on `{device}`", + f"{completed}/{total} files transcribed in {elapsed}{avg_str}", + ] + if stages: + for stage_key, stage_label in _STAGE_LABELS.items(): + s = stages.get(stage_key) + if s is None: + continue + lines.append( + f"{stage_label}: {s.get('processed', 0)} processed, " + f"{s.get('skipped', 0)} skipped, {s.get('failed', 0)} failed" + ) + _send("\n".join(lines)) def notify_thermal_shutdown(device: str, temp: int, limit: int) -> None: @@ -62,34 +97,52 @@ def notify_thermal_shutdown(device: str, temp: int, limit: int) -> None: def notify_pipeline_parallel_complete( - worker_statuses: list[tuple[str, str, bool]], + worker_statuses: list[tuple[str, str, int, int]], total_discovered: int, total_processed: int, failures: int, elapsed_seconds: float, + stages: dict[str, dict[str, int]] | None = None, + avg_per_file_seconds: float = 0.0, ) -> None: """Send a Slack notification when a multi-GPU pipeline-parallel run ends. Args: - worker_statuses: One ``(label, device, ok)`` tuple per worker, - e.g. ``[("W0", "cuda:0", True), ("W1", "cuda:1", False)]``. + worker_statuses: One ``(label, device, exit_code, n_failures)`` tuple per worker, + e.g. ``[("W0", "cuda:0", 0, 0), ("W1", "cuda:1", 1, 3)]``. total_discovered: Total files discovered across all workers. total_processed: Files successfully transcribed (processed + skipped). failures: Total stage-level failures across all workers. elapsed_seconds: Wall-clock runtime in seconds. + stages: Optional aggregated per-stage counts keyed by stage name, each + containing ``"processed"``, ``"skipped"``, and ``"failed"`` counts. + avg_per_file_seconds: Combined average wall-clock time per file. 
""" elapsed = _fmt_elapsed(elapsed_seconds) - all_ok = all(ok for _, _, ok in worker_statuses) and failures == 0 + all_ok = all(rc == 0 for _, _, rc, _ in worker_statuses) and failures == 0 icon = ":white_check_mark:" if all_ok else ":warning:" worker_parts = [] - for label, device, ok in worker_statuses: - status_icon = ":white_check_mark:" if ok else ":x:" - worker_parts.append(f"{label} (`{device}`): {status_icon}") + for label, device, rc, n_fail in worker_statuses: + if rc == 0: + worker_icon = ":white_check_mark:" + elif n_fail > 0: + worker_icon = f":warning: ({n_fail} failure{'s' if n_fail != 1 else ''})" + else: + worker_icon = f":x: (exit {rc})" + worker_parts.append(f"{label} (`{device}`): {worker_icon}") + avg_str = f" · avg/file: {_fmt_elapsed(avg_per_file_seconds)}" if avg_per_file_seconds else "" lines = [ f"{icon} *Pipeline-parallel {'complete' if all_ok else 'finished with issues'}*", " | ".join(worker_parts), - f"{total_processed}/{total_discovered} files processed in {elapsed}", + f"{total_processed}/{total_discovered} files transcribed in {elapsed}{avg_str}", ] - if failures > 0: - lines.append(f"{failures} file(s) failed") + if stages: + for stage_key, stage_label in _STAGE_LABELS.items(): + s = stages.get(stage_key) + if s is None: + continue + lines.append( + f"{stage_label}: {s.get('processed', 0)} processed, " + f"{s.get('skipped', 0)} skipped, {s.get('failed', 0)} failed" + ) _send("\n".join(lines)) diff --git a/src/pipeline.py b/src/pipeline.py index dc39f97..9b8c427 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -28,6 +28,7 @@ _has_torch = True except ImportError: + _torch = None # type: ignore[assignment] _has_torch = False from src.diarizer import ( @@ -218,7 +219,7 @@ def partition_ids(ids: list[str], n: int = 2) -> list[list[str]]: Worker i gets positions i, i+n, i+2n, ... This distributes workload more evenly than a naive chunked split when file durations correlate with naming. With n=2, - behavior is identical to the previous dual-worker implementation. + the behavior is identical to the previous dual-worker implementation. Args: ids: Ordered list of content_id strings (typically from discover_files()). @@ -594,6 +595,7 @@ def run_pipeline( demucs_output_dir: Demucs output root directory (RAM disk strongly recommended). diarization_dir: Directory for diarization_.json files. transcription_dir: Directory for transcription_.json files. + sentiment_dir: Directory for sentiment_.json files. device: GPU device string ('cuda', 'cuda:N', or 'cpu'). segment: Demucs segment size in seconds (VRAM optimisation). compute_type: WhisperX CTranslate2 compute type. @@ -662,7 +664,7 @@ def run_pipeline( # ── Load sentiment model if needed ──────────────────────────────────── # Text-only; independent of audio model availability. Loaded before the - # early-return check so a sentiment-only run (all audio done, sentiment + # early-return check, so a sentiment-only run (all audio done, sentiment # pending) falls through to Pass 3 rather than returning early. sentiment_pipeline_obj = None sentiment_load_error: str | None = None @@ -738,7 +740,7 @@ def run_pipeline( _sep_rtf = round(sep.processing_time_seconds / _sep_dur, 3) if _sep_dur > 0 else None _sep_vram = _read_vram(device) # no_vocals.wav is only needed by step 6 (CLAP). Delete it now unless - # events are enabled, to keep RAM disk usage as low as possible. + # events are enabled to keep RAM disk usage as low as possible. 
if not keep_scratch and not enable_events: _cleanup_stem(no_vocals) result.separation.outcomes.append( @@ -874,7 +876,7 @@ def run_pipeline( # ── Pass 3: Text Sentiment Analysis ─────────────────────────────────────── # Text-only step — reads the transcription JSON, produces per-segment scores. - # Runs after all audio stages so it covers both freshly-transcribed files + # Runs after all audio stages, so it covers both freshly-transcribed files # (processed in Pass 2) and previously-transcribed files (fast-tracked in # Pass 1). This means a first run with --sentiment on a previously-transcribed # base-dir produces sentiment for all files without re-running audio stages. diff --git a/src/separator.py b/src/separator.py index 1f6457c..8af87ae 100644 --- a/src/separator.py +++ b/src/separator.py @@ -21,7 +21,7 @@ class SeparationError(Exception): - """Raised when Demucs subprocess fails.""" + """Raised when the Demucs subprocess fails.""" def __init__(self, message: str, returncode: int | None = None, stderr: str = ""): super().__init__(message) diff --git a/src/transcriber.py b/src/transcriber.py index 0741a86..7a05997 100644 --- a/src/transcriber.py +++ b/src/transcriber.py @@ -56,7 +56,7 @@ def _suppress_output(): def _parse_whisperx_device(device: str) -> tuple[str, int]: """Split a PyTorch-style device string for ctranslate2's separate API. - ctranslate2 (whisperx's backend) takes device and device_index as separate + ctranslate2 (whisperx's backend) takes the device and device_index as separate parameters and does not accept 'cuda:N' as a combined string. PyTorch-style load_align_model and align() still use the original string. @@ -100,6 +100,8 @@ def transcribe( diarization_file: Optional path to a DiarizationResult JSON from step 2. When provided, speaker labels are merged into the transcript output. model: Whisper model size (default: 'large-v3'). + _whisperx_model: Pre-loaded WhisperX model instance. When provided, model + loading is skipped (used by the pipeline for batch efficiency). Returns: TranscriptionResult with full provenance of the transcription run. @@ -185,7 +187,7 @@ def transcribe( aligned = whisperx.align( raw_result["segments"], align_model, metadata, audio, device, return_char_alignments=False ) - except Exception: + except (RuntimeError, ValueError, OSError, KeyError): alignment_fallback = True aligned = raw_result diff --git a/uv.lock b/uv.lock index dd84456..3baca07 100644 --- a/uv.lock +++ b/uv.lock @@ -126,7 +126,7 @@ wheels = [ [[package]] name = "audio-refinery" -version = "0.1.0" +version = "0.1.1" source = { editable = "." } dependencies = [ { name = "click" },