From 898fe12cf2d36bab5ac5562ac150e9e3cf35b0aa Mon Sep 17 00:00:00 2001
From: Jammy2211 <JNightingale2211@gmail.com>
Date: Thu, 21 May 2026 15:16:12 +0100
Subject: [PATCH] likelihood: split into likelihood_breakdown +
 likelihood_runtime packages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Separates the two distinct goals that previously coexisted in five of the
nine `likelihood/<class>/<model>.py` scripts:

- `likelihood_breakdown/` — per-step JIT decomposition, single config
  (CPU fp64 default; `--gpu` opt-in). Answers *"where does time go
  inside this likelihood?"* Pedagogical / optimisation-focused.
- `likelihood_runtime/` — full-pipeline JIT only, driven by the existing
  CPU/GPU/A100 × fp64/mp sweep harness. Answers *"how long will this
  take on this hardware?"* Production-cost-focused.

Side-effect: the cheaper per-script runtime mode runs in
seconds-to-low-minutes; the heavier per-step mode only runs on demand.

# Moves (git mv — history preserved)

- `likelihood/_profile_cli.py`        → `_profile_cli.py`           (repo root)
- `likelihood/adapt_image_util.py`    → `_adapt_image_util.py`      (repo root)
- `likelihood/OPTIMIZATION_NOTES.md`  → `likelihood_runtime/OPTIMIZATION_NOTES.md`
- `scripts/sweep_likelihood.py`       → `likelihood_runtime/sweep.py`
- `scripts/aggregate_sweep.py`        → `likelihood_runtime/aggregate.py`
- 9 per-cell scripts: `likelihood/<class>/<model>.py` → `likelihood_runtime/<class>/<model>.py`

# Per-cell split

5 cells get both variants (script previously did both jobs):

- `imaging/{mge,pixelization,delaunay}.py`
- `interferometer/delaunay.py`
- `datacube/delaunay.py`

For each, `likelihood_runtime/<class>/<model>.py` has the per-step
`PART B` JIT section + per-step JSON keys + per-step bar chart
deleted; `likelihood_breakdown/<class>/<model>.py` is the per-step
half written fresh, sharing the setup/Part-A code with runtime by
design (per CLAUDE.md "three similar lines is better than a
premature abstraction").

4 cells stay runtime-only (no breakdown variant) — their existing
scripts were full-pipeline-by-design:

- `interferometer/mge.py` (intentional, per docstring)
- `interferometer/pixelization.py` (sparse-DFT path is already a single block)
- `point_source/{image_plane,source_plane}.py` (single short JIT shots)

# Import-path simplification

The split lets the two-line sys.path-insert dance in every cell
collapse to a single insert at `parents[2]` (autolens_profiling root),
where both `_profile_cli` and `_adapt_image_util` now live.

# Deep-research deliverable

Each package's README.md documents the methodology, output schema, and
"when-to-use" guidance for that package. The empirical findings (per-cell
timings, mp verdicts, GPU NUFFT regression, upstream blockers) stay in
`likelihood_runtime/OPTIMIZATION_NOTES.md`.

# Removed

- The old `likelihood/` tree (all content moved or replicated).
- 5 README.md files — the top-level `likelihood/README.md` plus the four
  per-class ones under `likelihood/{imaging,interferometer,datacube,point_source}/`
  — content folded into the two package READMEs.

# Follow-ups

- `scripts/build_readme.py` needs its glob paths updated for the new
  layout (it still scans the old `results/likelihood/<class>/` artifact
  tree). Out of scope for this PR; flagged in the top-level README.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 README.md                                     |  23 +-
 ...dapt_image_util.py => _adapt_image_util.py |   0
 likelihood/_profile_cli.py => _profile_cli.py |   0
 likelihood/README.md                          |  63 --
 likelihood/datacube/README.md                 |  57 --
 likelihood/imaging/README.md                  |  59 --
 likelihood/interferometer/README.md           |  47 --
 likelihood/point_source/README.md             |  57 --
 likelihood_breakdown/README.md                | 113 +++
 .../datacube/__init__.py                      |   0
 .../datacube/delaunay.py                      | 191 +-----
 .../imaging/delaunay.py                       | 221 +-----
 .../imaging/mge.py                            | 237 ++-----
 .../imaging/pixelization.py                   | 238 +------
 .../interferometer/delaunay.py                | 340 +--------
 .../OPTIMIZATION_NOTES.md                     |   0
 likelihood_runtime/README.md                  | 148 ++++
 .../aggregate.py                              |  18 +-
 likelihood_runtime/datacube/__init__.py       |   0
 likelihood_runtime/datacube/delaunay.py       | 601 ++++++++++++++++
 likelihood_runtime/imaging/delaunay.py        | 578 ++++++++++++++++
 likelihood_runtime/imaging/mge.py             | 507 ++++++++++++++
 likelihood_runtime/imaging/pixelization.py    | 574 ++++++++++++++++
 likelihood_runtime/interferometer/delaunay.py | 647 ++++++++++++++++++
 .../interferometer/mge.py                     |   1 -
 .../interferometer/pixelization.py            |   6 +-
 .../point_source/image_plane.py               |   1 -
 .../point_source/source_plane.py              |   1 -
 .../sweep.py                                  |  16 +-
 29 files changed, 3362 insertions(+), 1382 deletions(-)
 rename likelihood/adapt_image_util.py => _adapt_image_util.py (100%)
 rename likelihood/_profile_cli.py => _profile_cli.py (100%)
 delete mode 100644 likelihood/README.md
 delete mode 100644 likelihood/datacube/README.md
 delete mode 100644 likelihood/imaging/README.md
 delete mode 100644 likelihood/interferometer/README.md
 delete mode 100644 likelihood/point_source/README.md
 create mode 100644 likelihood_breakdown/README.md
 rename {likelihood => likelihood_breakdown}/datacube/__init__.py (100%)
 rename {likelihood => likelihood_breakdown}/datacube/delaunay.py (81%)
 rename {likelihood => likelihood_breakdown}/imaging/delaunay.py (81%)
 rename {likelihood => likelihood_breakdown}/imaging/mge.py (75%)
 rename {likelihood => likelihood_breakdown}/imaging/pixelization.py (80%)
 rename {likelihood => likelihood_breakdown}/interferometer/delaunay.py (70%)
 rename {likelihood => likelihood_runtime}/OPTIMIZATION_NOTES.md (100%)
 create mode 100644 likelihood_runtime/README.md
 rename scripts/aggregate_sweep.py => likelihood_runtime/aggregate.py (95%)
 create mode 100644 likelihood_runtime/datacube/__init__.py
 create mode 100644 likelihood_runtime/datacube/delaunay.py
 create mode 100644 likelihood_runtime/imaging/delaunay.py
 create mode 100644 likelihood_runtime/imaging/mge.py
 create mode 100644 likelihood_runtime/imaging/pixelization.py
 create mode 100644 likelihood_runtime/interferometer/delaunay.py
 rename {likelihood => likelihood_runtime}/interferometer/mge.py (99%)
 rename {likelihood => likelihood_runtime}/interferometer/pixelization.py (99%)
 rename {likelihood => likelihood_runtime}/point_source/image_plane.py (99%)
 rename {likelihood => likelihood_runtime}/point_source/source_plane.py (99%)
 rename scripts/sweep_likelihood.py => likelihood_runtime/sweep.py (94%)
diff --git a/README.md b/README.md
index 138d932..be3cc05 100644
--- a/README.md
+++ b/README.md
@@ -24,23 +24,9 @@ Results are framed by **astronomy instrument** (HST, Euclid, JWST, …) rather t
 
 ## Latest run-times
 
-The table below is auto-generated from the latest versioned artifacts under `results/`. Each row is the latest steady-state per-call cost for a likelihood path at a given instrument; numbers refresh whenever the producing scripts are rerun and committed. Hardware tier is **CPU only** today — laptop GPU and HPC GPU columns will land once `results/**` artifacts are tagged with a hardware label.
-
-<!-- BEGIN auto-table:headline -->
-| Section | Script | Instrument | Latest single-JIT per-call | PyAutoLens version |
-|---------|--------|------------|----------------------------|--------------------|
-| likelihood/datacube | `delaunay.py` | hannah | — | v2026.5.14.2 |
-| likelihood/imaging | `delaunay.py` | hst | 833.4 ms | v2026.5.14.2 |
-| likelihood/imaging | `mge.py` | hst | 41.6 ms | v2026.5.14.2 |
-| likelihood/imaging | `pixelization.py` | hst | 782.3 ms | v2026.5.14.2 |
-| likelihood/interferometer | `delaunay.py` | sma | 154.5 ms | v2026.5.14.2 |
-| likelihood/interferometer | `mge.py` | sma | 33.6 ms | v2026.5.14.2 |
-| likelihood/interferometer | `pixelization.py` | sma | 113.6 ms | v2026.5.14.2 |
-| likelihood/point_source | `image_plane.py` | — | 22.5 ms | v2026.5.14.2 |
-| likelihood/point_source | `source_plane.py` | — | 691 μs | v2026.5.14.2 |
-<!-- END auto-table:headline -->
-
-(Generator: `scripts/build_readme.py`. Run `python scripts/build_readme.py` after producing new artifacts to refresh; `--check` exits non-zero in CI if it would change anything.)
+Cell-level full-pipeline numbers live in [`likelihood_runtime/OPTIMIZATION_NOTES.md`](./likelihood_runtime/OPTIMIZATION_NOTES.md), which carries the latest CPU + local-GPU per-call costs together with per-cell "where to optimize next" recommendations and the mp-vs-fp64 verdicts. The detailed multi-config `comparison.json` artifacts are committed under [`autolens_workspace_developer/jax_profiling/results/jit/<class>/<model>/`](https://github.com/PyAutoLabs/autolens_workspace_developer/tree/main/jax_profiling/results/jit) and are re-aggregated by `likelihood_runtime/aggregate.py` whenever a new sweep finishes.
+
+(The previous auto-generated table in this README was retired when the likelihood profiling was split into `likelihood_breakdown/` + `likelihood_runtime/`; `scripts/build_readme.py` is queued for a path-update follow-up.)
 
 ## JAX gradients — currently out of scope
 
@@ -67,7 +53,8 @@ Examples that already exist in the source-of-truth repo:
 
 | Folder | Contents |
 |--------|----------|
-| [`likelihood/`](./likelihood/README.md) | Likelihood JIT profiling — imaging, interferometer, point-source, datacube. |
+| [`likelihood_breakdown/`](./likelihood_breakdown/README.md) | Per-step JIT decomposition. Single config. *Where does time go inside the likelihood?* |
+| [`likelihood_runtime/`](./likelihood_runtime/README.md) | Full-pipeline JIT only, driven by `sweep.py` across CPU/GPU/A100 × fp64/mp. *How long will this likelihood take on this hardware?* |
 | [`simulators/`](./simulators/README.md) | Run-time tracking for the PyAutoLens simulators. |
 | [`searches/`](./searches/README.md) | Sampler / search profiling, Nautilus first. |
 | [`results/`](./results/README.md) | Versioned JSON + PNG artifacts written by the above scripts. |
diff --git a/likelihood/adapt_image_util.py b/_adapt_image_util.py
similarity index 100%
rename from likelihood/adapt_image_util.py
rename to _adapt_image_util.py
diff --git a/likelihood/_profile_cli.py b/_profile_cli.py
similarity index 100%
rename from likelihood/_profile_cli.py
rename to _profile_cli.py
diff --git a/likelihood/README.md b/likelihood/README.md
deleted file mode 100644
index 0751281..0000000
--- a/likelihood/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# likelihood
-
-JAX JIT profiling for the PyAutoLens likelihood function across imaging, interferometer, point-source, and datacube datasets, and across the MGE, pixelization, and Delaunay model compositions used in real science cases.
-
-## What "JIT likelihood profiling" means
-
-For each science case, the likelihood function turns a parameter vector into a single number (log-likelihood) via a chain of array operations: instantiate the model, build the `Tracer`, ray-trace grids through the lens, compute a mapping matrix, blur it with the PSF, solve a linear-algebra reconstruction problem, and finally compute a chi-squared. Under `xp=jnp`, every step is dispatched as a JAX op and can be compiled into a single XLA program with `jax.jit`.
-
-Profiling the **whole likelihood** as one JIT'd function gives the honest per-call cost a sampler will see in production. Profiling **each step individually** under its own JIT gives the breakdown that tells you where the time is going. Both numbers matter: the whole-function timing is the production cost, and the per-step breakdown is the optimisation target. Each script in this section reports both wherever the underlying pipeline supports per-step JIT-ing (the interferometer and datacube paths intentionally stay at full-pipeline JIT for now — see those subfolders' READMEs for why).
-
-Every script also reports a **batched (`jax.vmap`) per-likelihood cost** to make explicit how much the JIT amortises across a population of evaluations — the regime an actual sampler operates in.
-
-## How to read the per-script output
-
-Each script prints a structured narrative to stdout, ending in:
-
-- The eager (numpy) baseline log-likelihood for sanity.
-- The single-JIT lower / compile / first-call / steady-state per-call timings.
-- The vmap per-likelihood cost and speedup vs single-JIT.
-- A correctness check: eager ≡ JIT ≡ vmap log-likelihoods at `rtol=1e-4`.
-- A `results/likelihood/<type>/<script>_likelihood_summary_<instrument>_v<al.__version__>.{json,png}` write.
-
-The JSON carries the structured timings keyed by step name plus the model / dataset metadata. The PNG is a bar chart of per-step costs (where applicable) plus the single-JIT vs vmap comparison.
-
-## Versioned artifacts
-
-Result files are tagged with the PyAutoLens release that produced them (`al.__version__`). Old versions remain alongside new ones so cross-release trends stay visible — Phase 4's dashboard will read the latest per axis and present the headline numbers framed by astronomy instrument.
-
-See the top-level [results/README.md](../results/README.md) for the full filename convention.
-
-## Sections
-
-| Folder | Profiles |
-|--------|----------|
-| [`imaging/`](./imaging/README.md) | MGE, pixelization, and Delaunay likelihoods on imaging datasets (HST-resolution by default). |
-| [`interferometer/`](./interferometer/README.md) | MGE, pixelization, and Delaunay likelihoods on interferometer (visibility-space) datasets (SMA by default). |
-| [`point_source/`](./point_source/README.md) | Image-plane and source-plane chi-squared for lensed point sources. |
-| [`datacube/`](./datacube/README.md) | Multi-channel datacube likelihoods (e.g. ALMA-style) with Delaunay pixelization. |
-
-## Running a script
-
-From the repo root:
-
-```bash
-cd autolens_profiling
-python likelihood/imaging/mge.py
-```
-
-Scripts use the input datasets under `dataset/<type>/<instrument>/` (see top-level [README](../README.md)). The default instrument is encoded per-script; some support a CLI flag to switch instruments. Run with `--help` for the supported options.
-
-**Codex / sandboxed runs** — set writable cache dirs so numba and matplotlib don't choke on read-only home/source paths:
-
-```bash
-NUMBA_CACHE_DIR=/tmp/numba_cache MPLCONFIGDIR=/tmp/matplotlib python likelihood/imaging/mge.py
-```
-
-## Conventions inherited from `autolens_workspace_developer`
-
-The scripts follow the JIT conventions documented at `autolens_workspace_developer/CLAUDE.md`:
-
-- All autoarray types (`Array2D`, `Grid2D`, `Grid2DIrregular`, …) expose `.array` for the raw `np.ndarray` / `jax.Array` underneath. These are extracted before crossing the `jax.jit` boundary because autoarray types are not registered as JAX pytrees as **inputs**.
-- The `xp` parameter (`xp=np` default, `xp=jnp` for JAX) controls the backend. JIT'd closures pass `xp=jnp` through every nested call.
-- The model is converted to a JAX pytree via `autofit.jax.register_model(model)` so `af.ModelInstance` can cross the JIT boundary directly — no manual flat-vector unpacking.
diff --git a/likelihood/datacube/README.md b/likelihood/datacube/README.md
deleted file mode 100644
index 863b432..0000000
--- a/likelihood/datacube/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# likelihood/datacube
-
-JAX JIT profiling for the PyAutoLens **datacube** likelihood function — fitting an N-channel cube of interferometer observations (e.g. ALMA velocity / frequency channels) that share a single lens model, with each channel reconstructing its own Delaunay-pixelized source.
-
-## Channel-invariant vs channel-variant split
-
-The key new ingredient relative to the single-channel interferometer path is the **channel-invariant vs channel-variant** decomposition: most steps are computed once for the whole cube (shared lens model, shared mesh, shared mask), while only a few steps recur per channel.
-
-| Step | Channel-invariant? | Computed |
-|------|--------------------|----------|
-| 1. Ray-trace data grid | yes | once for the cube |
-| 2. Ray-trace mesh grid | yes | once for the cube |
-| 3. Inversion setup (border + mapper + NUFFT) | NUFFT depends on `uv_wavelengths` | once per channel |
-| 4. Data vector D | per channel | once per channel |
-| 5. Curvature matrix F | per channel | once per channel |
-| 6. Regularization matrix H | yes | once for the cube |
-| 7. Reconstruction (NNLS) | per channel | once per channel |
-| 8. Mapped recon + log-evidence | per channel | once per channel |
-
-The cube total is:
-
-```
-cube_cost = sum(channel_invariant_costs) + N_channels * sum(channel_variant_costs)
-```
-
-That number quantifies how much a future "shared `Lᵀ W̃ L`" optimisation would save: moving the curvature matrix from per-channel to shared would subtract `(N − 1) * curvature_matrix_cost` from the cube total. The profiling script reports this number directly.
-
-## Scripts
-
-| Script | What it profiles |
-|--------|------------------|
-| [`delaunay.py`](./delaunay.py) | Step-by-step JIT profiling of an N-channel datacube with shared lens model and per-channel Delaunay source reconstruction. Mirrors the per-step structure of [`../interferometer/delaunay.py`](../interferometer/delaunay.py). |
-
-## Default dataset
-
-`dataset/interferometer/sma/` — the same SMA-like mock used by the single-channel interferometer scripts, **loaded N times as a 4-channel cube**. Every channel has identical visibilities, noise map, and uv_wavelengths — the point here is timing, not science. The N-channel cube log-evidence is exactly `N × single-channel log-evidence`, which makes the regression assertion trivial.
-
-For a realistic per-channel-distinct cube, point the loader at the workspace simulator output at `autolens_workspace/dataset/interferometer/datacube/sim_simple/`. The JIT-cost taxonomy doesn't change — it's a function of which arrays are loop-variables in `FitInterferometer`, not the data values themselves.
-
-## Headline run-times (latest per dataset)
-
-Auto-generated by `scripts/build_readme.py` from the latest `*_summary_v<version>.json` artifacts under `results/likelihood/datacube/`. Hardware tier is CPU only today.
-
-<!-- BEGIN auto-table:likelihood-datacube -->
-| Script | Instrument | Latest single-JIT per-call | PyAutoLens version |
-|--------|------------|----------------------------|--------------------|
-| `delaunay.py` | hannah | — | v2026.5.14.2 |
-<!-- END auto-table:likelihood-datacube -->
-
-## Output
-
-The script writes:
-
-```
-results/likelihood/datacube/delaunay_likelihood_summary_<instrument>_v<al.__version__>.json
-results/likelihood/datacube/delaunay_likelihood_summary_<instrument>_v<al.__version__>.png
-```
diff --git a/likelihood/imaging/README.md b/likelihood/imaging/README.md
deleted file mode 100644
index 931d182..0000000
--- a/likelihood/imaging/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-# likelihood/imaging
-
-JAX JIT profiling for the PyAutoLens **imaging** likelihood function — strong-lens reconstruction from a single 2D image with a known PSF.
-
-The imaging likelihood path is the most fully JIT-decomposed of the four data types. Each script here profiles the **whole likelihood** as one JIT'd program (the cost a sampler actually pays) **and** every internal step under its own JIT (the per-step breakdown that tells you where the time goes). Three model compositions are covered, all with the same step structure but different source representations:
-
-- **MGE** — multi-Gaussian expansion source. Linear light profiles only, so no profile-subtracted image stage. Simplest source path.
-- **Pixelization** — rectangular pixelized source. Adds mesh-to-image mapping and a regularization matrix.
-- **Delaunay** — Delaunay-triangulated source pixelization with adaptive resolution. Heaviest source path; closest to production reconstructions for irregular-morphology sources.
-
-## Step-by-step structure each script profiles
-
-Per the docstrings in each script, the imaging likelihood decomposes into:
-
-1. Instance from parameter vector
-2. Build `Tracer`
-3. Ray-trace grids through the lens
-4. Compute mapping matrix (per-profile images before PSF)
-5. Compute blurred mapping matrix (PSF convolution)
-6. Compute data vector D
-7. Compute curvature matrix F
-8. Reconstruction via positive-only NNLS
-9. Map reconstruction back to image plane
-10. Chi-squared and log-likelihood
-
-XLA may fuse these differently when compiled as one program vs separate pieces, so per-step timings are approximate. They are still the right tool for identifying which step dominates.
-
-## Scripts
-
-| Script | Source representation | What it profiles |
-|--------|-----------------------|------------------|
-| [`mge.py`](./mge.py) | MGE (linear light profiles only) | Lightest imaging path. Useful as the baseline against which pixelization / Delaunay are compared. |
-| [`pixelization.py`](./pixelization.py) | Rectangular pixelization | Adds source mesh + regularization. |
-| [`delaunay.py`](./delaunay.py) | Delaunay pixelization | Production-style irregular-source reconstruction. |
-
-## Default dataset
-
-`dataset/imaging/hst/` — an HST-resolution mock (pixel scale 0.05″, 21×21 PSF) committed to this repo. Other instruments (`euclid`, `jwst`, `ao`) can be regenerated via the source-of-truth scripts at `autolens_workspace_developer/jax_profiling/dataset_setup/imaging.py` and copied into `dataset/imaging/<instrument>/`.
-
-## Headline run-times (latest per script × instrument)
-
-Auto-generated by `scripts/build_readme.py` from the latest `*_summary_v<version>.json` artifacts under `results/likelihood/imaging/`. Hardware tier is CPU only today.
-
-<!-- BEGIN auto-table:likelihood-imaging -->
-| Script | Instrument | Latest single-JIT per-call | PyAutoLens version |
-|--------|------------|----------------------------|--------------------|
-| `delaunay.py` | hst | 833.4 ms | v2026.5.14.2 |
-| `mge.py` | hst | 41.6 ms | v2026.5.14.2 |
-| `pixelization.py` | hst | 782.3 ms | v2026.5.14.2 |
-<!-- END auto-table:likelihood-imaging -->
-
-## Output
-
-Each script writes:
-
-```
-results/likelihood/imaging/<script>_likelihood_summary_<instrument>_v<al.__version__>.json
-results/likelihood/imaging/<script>_likelihood_summary_<instrument>_v<al.__version__>.png
-```
diff --git a/likelihood/interferometer/README.md b/likelihood/interferometer/README.md
deleted file mode 100644
index 24e1ff5..0000000
--- a/likelihood/interferometer/README.md
+++ /dev/null
@@ -1,47 +0,0 @@
-# likelihood/interferometer
-
-JAX JIT profiling for the PyAutoLens **interferometer** likelihood function — strong-lens reconstruction from radio / sub-millimetre visibility data (e.g. SMA, ALMA, VLA).
-
-The interferometer likelihood path is profiled at **full-pipeline JIT** only, not per-step. The reason is deliberate: the interferometer pipeline runs a Fourier-transformed mapping matrix, a visibilities-space data vector and curvature matrix, and an NNLS solve whose `xp=jnp` threading hasn't been fully characterised yet. Decomposing into per-step JITs would risk missing the cross-step XLA fusion that matters in practice, and would risk hitting library-level JAX blockers that we'd want to raise as separate issues rather than work around in the profiling scripts. Once the full-pipeline JIT is stable across all three model compositions, the per-step breakdown can land as a follow-up.
-
-## What each script measures
-
-1. **Eager baseline** — `FitInterferometer` with `xp=np`, reporting `figure_of_merit` / `log_likelihood`.
-2. **Full-pipeline JIT** — `jax.jit(analysis.log_likelihood_function)` on a pytree-registered `ModelInstance`. Reports lower / compile / first-call / steady-state per-call timings.
-3. **Batched evaluation** — `jax.jit(jax.vmap(full_pipeline))` for the population-of-evaluations regime a sampler actually runs in. Reports per-likelihood cost and speedup vs the single-JIT path.
-4. **Correctness check** — eager ≡ JIT log-likelihood agreement at `rtol=1e-4`.
-5. **Static memory analysis** of the batched program.
-6. **Versioned JSON + PNG write** using the same schema as the imaging scripts so results compare side-by-side.
-
-## Scripts
-
-| Script | Source representation | What it profiles |
-|--------|-----------------------|------------------|
-| [`mge.py`](./mge.py) | MGE (linear light profiles only) | Baseline interferometer path. Exercises the `TuplePrior` pytree support landed in PyAutoFit#1222. |
-| [`pixelization.py`](./pixelization.py) | Rectangular pixelization | Adds source mesh + regularization to the visibility-space inversion. |
-| [`delaunay.py`](./delaunay.py) | Delaunay pixelization | Production-style irregular-source reconstruction in visibility space. |
-
-## Default dataset
-
-`dataset/interferometer/sma/` — an SMA-like mock (pixel scale 0.1″, real-space shape 256×256) committed to this repo. Includes `data.fits`, `noise_map.fits`, `uv_wavelengths.fits`, `positions.json`, and the seeded `tracer.json`. Other instruments (e.g. ALMA, VLA) can be regenerated via the source-of-truth scripts at `autolens_workspace_developer/jax_profiling/dataset_setup/interferometer.py` and copied into `dataset/interferometer/<instrument>/`.
-
-## Headline run-times (populated by Phase 4)
-
-Auto-generated by `scripts/build_readme.py` from the latest `*_summary_v<version>.json` artifacts under `results/likelihood/interferometer/`. Hardware tier is CPU only today.
-
-<!-- BEGIN auto-table:likelihood-interferometer -->
-| Script | Instrument | Latest single-JIT per-call | PyAutoLens version |
-|--------|------------|----------------------------|--------------------|
-| `delaunay.py` | sma | 154.5 ms | v2026.5.14.2 |
-| `mge.py` | sma | 33.6 ms | v2026.5.14.2 |
-| `pixelization.py` | sma | 113.6 ms | v2026.5.14.2 |
-<!-- END auto-table:likelihood-interferometer -->
-
-## Output
-
-Each script writes:
-
-```
-results/likelihood/interferometer/<script>_likelihood_summary_<instrument>_v<al.__version__>.json
-results/likelihood/interferometer/<script>_likelihood_summary_<instrument>_v<al.__version__>.png
-```
diff --git a/likelihood/point_source/README.md b/likelihood/point_source/README.md
deleted file mode 100644
index 80010f7..0000000
--- a/likelihood/point_source/README.md
+++ /dev/null
@@ -1,57 +0,0 @@
-# likelihood/point_source
-
-JAX JIT profiling for the PyAutoLens **point-source** likelihood function — fitting the multiple images of a lensed point source (quasar, supernova, or compact emitter) given an observed set of image-plane positions and a parametric lens model.
-
-The point-source likelihood has two distinct chi-squared variants depending on where the comparison between model and data happens. Both are JIT-traceable end-to-end through the underlying `PointSolver` (a triangle-refinement loop that threads `xp=jnp` through every step).
-
-## Two chi-squared variants
-
-| Variant | Computed in | Comparison |
-|---------|-------------|------------|
-| **Image-plane** | image plane coordinates | Solve for the model image positions via `PointSolver`, pair each model image with the closest observed image, compute χ² in image-plane coordinates. |
-| **Source-plane** | source plane coordinates | Ray-trace observed image positions back to the source plane, compute χ² of the inferred source-plane scatter (i.e. how tightly the back-traced images cluster around a single source). |
-
-Image-plane fitting is closer to what an observer would naively expect ("how well do the model images line up with the data images?"), but is more expensive because of the forward solver. Source-plane fitting is cheaper (no forward solve) but is a proxy: zero scatter in the source plane does not exactly correspond to zero image-plane chi-squared.
-
-## What each script measures
-
-For both variants:
-
-1. **Eager baseline** — `xp=np` log-likelihood for sanity.
-2. **Single-JIT** — lower / compile / first-call / steady-state per-call timings.
-3. **vmap** — batched per-likelihood cost and speedup vs single-JIT.
-4. **Three-tier numerical correctness**:
-    - eager ≡ JIT
-    - JIT ≡ vmap (every entry of the batched output)
-    - a hardcoded `EXPECTED_LOG_LIKELIHOOD_*` regression constant guarding against silent drift in the solver / chi-squared stack. This depends on the seeded simulator (`noise_seed=1` in the upstream `dataset_setup/point_source.py`) staying bit-stable.
-
-## Scripts
-
-| Script | Variant | Notes |
-|--------|---------|-------|
-| [`image_plane.py`](./image_plane.py) | Image-plane χ² | Full pipeline JIT-traceable end-to-end because `PointSolver` threads `xp=jnp` through every step. |
-| [`source_plane.py`](./source_plane.py) | Source-plane χ² | Cheaper than image-plane; no forward solver. |
-
-## Default dataset
-
-`dataset/point_source/simple/` — a minimal seeded dataset with `point_dataset_positions_only.json` (4 observed image positions) and the truth `tracer.json`. Both files are committed to this repo.
-
-## Headline run-times (latest per script × dataset)
-
-Auto-generated by `scripts/build_readme.py` from the latest `*_summary_v<version>.json` artifacts under `results/likelihood/point_source/`. Hardware tier is CPU only today. **Cells may show `—` while [PyAutoLens#514](https://github.com/PyAutoLabs/PyAutoLens/issues/514) is open** — the regression assertion in both scripts is intentionally load-bearing while the upstream drift is triaged, so neither script reaches the JSON-write step in the current PyAutoLens release.
-
-<!-- BEGIN auto-table:likelihood-point_source -->
-| Script | Instrument | Latest single-JIT per-call | PyAutoLens version |
-|--------|------------|----------------------------|--------------------|
-| `image_plane.py` | — | 22.5 ms | v2026.5.14.2 |
-| `source_plane.py` | — | 691 μs | v2026.5.14.2 |
-<!-- END auto-table:likelihood-point_source -->
-
-## Output
-
-Each script writes:
-
-```
-results/likelihood/point_source/<script>_likelihood_summary_<dataset_name>_v<al.__version__>.json
-results/likelihood/point_source/<script>_likelihood_summary_<dataset_name>_v<al.__version__>.png
-```
diff --git a/likelihood_breakdown/README.md b/likelihood_breakdown/README.md
new file mode 100644
index 0000000..13a15cf
--- /dev/null
+++ b/likelihood_breakdown/README.md
@@ -0,0 +1,113 @@
+# likelihood_breakdown
+
+Per-step JIT decomposition of the PyAutoLens likelihood function. The headline question is:
+
+> *"Where, inside this likelihood, is the time actually going?"*
+
+Run one of these scripts when you want to find the **next source-code-level optimization target** for a given dataset class / model. Each script JIT-compiles every step of the pipeline as an isolated JAX program and reports its lower / compile / first-call / steady-state time, then writes a per-step JSON + horizontal bar chart so the dominant step is immediately visible.
+
+For *how long the likelihood actually takes* on production hardware — i.e. a single end-to-end number per (hardware, precision) config — use the sibling package [`likelihood_runtime/`](../likelihood_runtime/) instead. The two packages are deliberately disjoint so neither has to pay the other's cost.
+
+## Methodology
+
+For each pipeline step (e.g. *ray-trace grids* → *blurred mapping matrix* → *curvature matrix F* → *NNLS reconstruction* → *log-evidence*), the script:
+
+1. Wraps the step in a small Python function whose only argument is the upstream JAX array(s) it consumes.
+2. Calls a per-step `jit_profile(func, label, *args, n_repeats=10)` helper that records:
+   - **lower** — `jax.jit(func).lower(...)` (JAX → MLIR tracing time)
+   - **compile** — XLA → device-binary compile time
+   - **first call** — initial execution including any deferred kernel setup
+   - **steady_state × 10** — average over ten subsequent calls; this is the number that goes into the per-step bar.
+3. Asserts the JIT output matches the eager `FitImaging` / `FitInterferometer` reference at `rtol=1e-4`, so the per-step decomposition is numerically verified against the production path.
+4. Emits a JSON with `{steps: {name: per_call_s}, total_step_by_step: ...}` and a single horizontal bar chart sorted by step cost.
+
+## Single-config rationale
+
+The per-step JIT compile is expensive — for a 1000-vertex Delaunay cell it's tens of minutes of compile time. Running the same per-step decomposition six times (CPU/GPU/A100 × fp64/mp) multiplies that sixfold — an hour or more of compile per cell — for marginal new signal: cross-hardware comparisons live in the runtime package. Default is **CPU fp64**.
+
+Opt in to GPU with `--gpu` when you suspect a step's bottleneck shape changes on different backends (XLA fusion behaviour, sparse-precision-matrix layout, etc.). Apart from the rarely needed `--use-mixed-precision` flag (see [Running](#running)), this is the only multi-config knob the breakdown scripts expose.
+
+## XLA fusion caveat
+
+XLA can — and frequently does — fuse adjacent steps into a single kernel at full-pipeline JIT time, so the **sum of per-step bars is an *upper bound* on the production cost**, not the production cost itself. If
+
+    sum(steps) ≫ full_pipeline_single_jit  (from likelihood_runtime/)
+
+then fusion is doing most of the optimisation for you and the per-step decomposition on its own is a misleading guide; focus optimisation on whichever step is large *and* doesn't fuse cleanly (typically steps that involve a Python-level callback, a serial NNLS solve, or a non-uniform memory pattern).
+
+If, on the other hand, the two numbers agree closely, the per-step bars are a faithful map of the per-call cost and the biggest bar is your target.
+
+## Scripts
+
+| Script | Dataset class | Source model | Notes |
+|--------|--------------|--------------|-------|
+| `imaging/mge.py` | Imaging | MGE linear bulge | Linear MGE source; 8-step pipeline. |
+| `imaging/pixelization.py` | Imaging | RectangularAdaptImage | 13-step pipeline incl. mesh + regularisation. |
+| `imaging/delaunay.py` | Imaging | DelaunayBrightnessImage | 13-step pipeline; Hilbert-curve mesh. |
+| `interferometer/delaunay.py` | Interferometer | DelaunayBrightnessImage + sparse-DFT | 11-step pipeline. The transform-mapping-matrix step is the interferometer-specific replacement for imaging's PSF convolution. |
+| `datacube/delaunay.py` | Datacube | DelaunayBrightnessImage × N channels | 8-step pipeline. Channel-invariant steps profiled once; channel-variant steps profiled on channel 0 and multiplied by `N_channels` for the cube cost. |
+
+Four cells are intentionally absent from this package:
+- `interferometer/mge` — full-pipeline-by-design, no per-step decomposition (see runtime).
+- `interferometer/pixelization` — same reason; the sparse precision-operator path doesn't decompose meaningfully.
+- `point_source/{image_plane,source_plane}` — single short JIT shots.
+
+These four live only in `likelihood_runtime/`.
+
+## How to read the output
+
+For each cell, the script writes two files into `results/breakdown/<class>/`:
+
+- `<model>_breakdown_<instrument>_v<al_version>.json` — schema:
+  ```jsonc
+  {
+    "autolens_version": "2026.5.14.2",
+    "device": {"backend": "cpu", "device": "TFRT_CPU_0"},
+    "instrument": "hst",
+    "configuration": { ... mask + mesh + pixel-scale snapshot ... },
+    "steps": {
+      "Ray-trace grids":           0.0020,
+      "Blurred image (PSF)":       0.0016,
+      "Mapping matrix":            0.0586,
+      "Blurred mapping matrix":    0.1493,
+      "Curvature matrix (F)":      0.0039,
+      "NNLS reconstruction":       0.0007,
+      "Mapped reconstructed image":0.0004,
+      "Chi-squared / log_evidence":0.0008
+    },
+    "total_step_by_step": 0.2173,
+    "log_likelihood_eager": ...
+  }
+  ```
+- `<model>_breakdown_<instrument>_v<al_version>.png` — horizontal bar chart, one bar per step, sorted by cost.
+
+### Reading the bar chart
+
+The top bar is the next thing to optimise. Compare against the runtime package's `full_pipeline_single_jit` for the same cell — if `total_step_by_step` is within a factor of ~2 of the full-pipeline number, the per-step decomposition is faithful. If it's much larger, XLA fusion is hiding the real cost; treat per-step as a coarse guide.
+
+## Running
+
+From the autolens_profiling root:
+
+```bash
+# Default (CPU fp64, sma/hst as appropriate to the cell)
+python likelihood_breakdown/imaging/mge.py
+
+# GPU backend
+python likelihood_breakdown/imaging/mge.py --gpu
+
+# Mixed precision (rare in breakdown — only when investigating fp32-induced step changes)
+python likelihood_breakdown/imaging/mge.py --use-mixed-precision
+```
+
+The dataset is auto-simulated via `simulators/<dataset_type>.py --instrument <name>` if `dataset/<class>/<instrument>/` is missing. Subsequent runs reuse the cached dataset.
+
+## When to choose breakdown vs runtime
+
+| Question | Package |
+|----------|---------|
+| "Where should I focus PyAutoLens optimisation work for this cell?" | **breakdown** |
+| "How long will my A100 sampler run take per likelihood call?" | runtime |
+| "Does mixed precision actually save time on this cell?" | runtime |
+| "Which step fuses cleanly under XLA and which doesn't?" | breakdown (compare total_step_by_step vs runtime's full_pipeline_single_jit) |
+| "How does the bottleneck shape change between consumer GPU and A100?" | runtime (and re-run breakdown on the new hardware if the shape changed) |
diff --git a/likelihood/datacube/__init__.py b/likelihood_breakdown/datacube/__init__.py
similarity index 100%
rename from likelihood/datacube/__init__.py
rename to likelihood_breakdown/datacube/__init__.py
diff --git a/likelihood/datacube/delaunay.py b/likelihood_breakdown/datacube/delaunay.py
similarity index 81%
rename from likelihood/datacube/delaunay.py
rename to likelihood_breakdown/datacube/delaunay.py
index ca2b3fc..3321d4e 100644
--- a/likelihood/datacube/delaunay.py
+++ b/likelihood_breakdown/datacube/delaunay.py
@@ -1,28 +1,23 @@
 """
-JAX Profiling: Delaunay Datacube Likelihood (Step-by-Step)
-==========================================================
-
-Profiles each step of the JAX likelihood function for an ALMA-style datacube —
-a list of N ``Interferometer`` channels sharing a single lens model — where
-each channel reconstructs its own source with a Delaunay pixelization +
-ConstantSplit regularization.
-
-Mirrors the step-by-step structure of
-``likelihood/interferometer/delaunay.py`` (Phase 2 of the datacube
-roadmap, just merged). The key new ingredient is the **channel-invariant vs
-channel-variant** split: most steps are computed once for the whole cube
-(shared lens, shared mesh, shared mask), only the NUFFT-based inversion-setup
-chain, the data vector, the curvature matrix, the reconstruction, and the
-log-evidence depend on per-channel data.
+JAX Profiling: Delaunay Datacube Likelihood — Per-Step Breakdown
+================================================================
+
+Decomposes the JAX likelihood function for an ALMA-style datacube (a list
+of N ``Interferometer`` channels sharing a single lens model) into
+individually JIT-profiled steps.  Channel-invariant steps are timed once;
+channel-variant steps are JIT-compiled on channel 0 and the reported cube
+cost is ``N × per-channel steady-state per-call``.
+
+This is the **breakdown** companion to ``likelihood_runtime/datacube/delaunay.py``.
+The runtime variant benchmarks the full-pipeline single-JIT and vmap
+performance; this script isolates the cost of every individual pipeline
+stage so the shared-``Lᵀ W̃ L`` optimisation target is clearly visible
+in the bar chart.
 
 The cube total is::
 
     cube_cost = sum(channel_invariant_costs) + N_channels * sum(channel_variant_costs)
 
-That number quantifies how much the deferred shared-``Lᵀ W̃ L`` optimisation
-will save: moving the curvature matrix from per-channel to shared would
-subtract ``(N - 1) * curvature_matrix_cost`` from the cube total.
-
 Channel-invariant vs channel-variant taxonomy
 ---------------------------------------------
 
@@ -49,15 +44,7 @@
 This profiler reuses the SMA interferometer dataset
 (``dataset/interferometer/sma/``) loaded N times as a 4-channel
 "cube". Each channel has identical visibilities, noise map and uv_wavelengths
-— the point here is timing, not science. The N-channel cube log-evidence is
-``N × single-channel log-evidence`` exactly, which makes the regression
-assertion trivial.
-
-If you want a realistic per-channel-distinct cube, point the loader at the
-workspace simulator output at
-``../autolens_workspace/dataset/interferometer/datacube/sim_simple/``; the
-JIT-cost taxonomy doesn't change because it's a function of which arrays are
-loop-variables in ``FitInterferometer``, not the data values themselves.
+— the point here is timing, not science.
 
 Measures
 --------
@@ -68,23 +55,8 @@
    call (lower / compile / first-call / steady-state × 10). Channel-invariant
    stages are timed once; channel-variant stages are timed on channel 0 and
    the cube cost is reported as ``N × per-call``.
-3. Full-pipeline cube JIT: ``jax.jit`` over the explicit
-   ``sum(analysis.log_likelihood_function(instance) for analysis in
-   analysis_list)`` — the same shape as the user-facing
-   ``datacube/likelihood_function.py`` and the cube modeling scripts'
-   internal ``FactorGraphModel`` sum.
-4. Correctness: per-step recomputed cube log-evidence and full-pipeline JIT
-   log-evidence both match the summed eager ``FitInterferometer.log_evidence``
-   at ``rtol=1e-4``.
-5. Results JSON + bar chart written to ``results/jit/datacube/`` using the
-   same schema as the interferometer sibling. Bar chart shows the cube-total
-   form of every step (channel-variant entries pre-multiplied by N).
-
-vmap is **skipped** for the cube profiler. The natural batching dimension is
-"datasets" (one entry per channel) not "parameters" (which the
-interferometer-sibling vmap exercises). A vmap-over-channels variant would
-require a different graph shape and isn't the bottleneck we care about for
-the shared-``Lᵀ W̃ L`` optimisation.
+3. Results JSON + bar chart written to ``results/breakdown/datacube/`` using the
+   same schema as the interferometer sibling.
 """
 
 import numpy as np
@@ -102,8 +74,8 @@
 import autoarray as aa
 from autofit.jax import register_model as _register_model_pytrees
 
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from adapt_image_util import adapt_image_for_dataset  # noqa: E402
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # Instrument configuration
@@ -121,8 +93,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
     device_info_dict,
@@ -786,88 +756,6 @@ def compute_log_evidence(
 )
 
 
-# ===================================================================
-# PART C — Full-pipeline cube JIT (sum of per-channel log_likelihoods)
-# ===================================================================
-
-print("\n" + "=" * 70)
-print("FULL-PIPELINE CUBE JIT (for comparison)")
-print("=" * 70)
-
-# Part C is expensive at large n_channels: lower + compile build a graph
-# proportional to n_channels (e.g. ~70s for n_channels=34 on a laptop CPU),
-# and the steady-state first-call follows. Default to skipping; opt in with
-# CUBE_FULL_JIT=1 when the full-pipeline timing matters (e.g. comparing
-# step-by-step total against single-JIT).
-_run_full_cube_jit = os.environ.get("CUBE_FULL_JIT") == "1"
-
-if _run_full_cube_jit:
-    analysis_list = [
-        al.AnalysisInterferometer(
-            dataset=d,
-            adapt_images=adapt_images,
-            settings=al.Settings(use_mixed_precision=_cli.use_mixed_precision),
-            use_jax=True,
-        )
-        for d in dataset_list
-    ]
-
-    def full_cube_pipeline_from_params(params_tree):
-        """Cube log-evidence via the explicit per-channel sum.
-
-        Same shape as the user-facing ``datacube/likelihood_function.py``:
-        feeds the shared instance to every per-channel
-        ``AnalysisInterferometer.log_likelihood_function`` and sums.
-        """
-        total = jnp.zeros(())
-        for analysis in analysis_list:
-            total = total + analysis.log_likelihood_function(instance=params_tree)
-        return total
-
-    _full_cube_n_repeats = 3
-    _, full_cube_result = jit_profile(
-        full_cube_pipeline_from_params,
-        "full_cube_pipeline",
-        params_tree,
-        n_repeats=_full_cube_n_repeats,
-    )
-    full_pipeline_per_call = timer.records[-1][1] / _full_cube_n_repeats
-
-    print(f"  full cube log_evidence (JIT) = {full_cube_result}")
-
-    np.testing.assert_allclose(
-        float(full_cube_result),
-        cube_log_evidence_ref,
-        rtol=1e-4,
-        err_msg="Full-pipeline cube JIT log_evidence does not match summed eager FitInterferometer.log_evidence",
-    )
-    print("  Eager-vs-JIT cube correctness PASSED")
-else:
-    full_cube_result = None
-    full_pipeline_per_call = float("nan")
-    print(
-        "  Full-pipeline cube JIT SKIPPED — opt-in via CUBE_FULL_JIT=1. "
-        f"At n_channels={n_channels} the lower + compile alone is on the order of "
-        f"{n_channels * 2}-{n_channels * 3}s, so it's gated to keep the default "
-        "runtime usable; the per-step Part B JIT data above is what feeds the "
-        "shared-Lᵀ W̃ L analysis."
-    )
-
-# ===================================================================
-# PART D — vmap (skipped for cube)
-# ===================================================================
-#
-# The natural batching axis for a cube fit is "datasets" (one entry per
-# channel), not "parameters" (which the interferometer-sibling vmap exercises).
-# vmap-over-channels would require a different graph shape and isn't where the
-# shared-Lᵀ W̃ L optimisation lives. Skipped.
-
-print("\n--- vmap (skipped) ---")
-print(
-    "  Cube batching dimension is 'datasets', not 'parameters'. The "
-    "interferometer-sibling vmap pattern doesn't map cleanly here. Skipped."
-)
-
 # ===================================================================
 # Summary + JSON + bar chart
 # ===================================================================
@@ -880,7 +768,7 @@ def full_cube_pipeline_from_params(params_tree):
 al_version = al.__version__
 
 print("\n" + "=" * 70)
-print(f"JAX LIKELIHOOD FUNCTION SUMMARY — CUBE {instrument.upper()} × {n_channels} — v{al_version}")
+print(f"JAX LIKELIHOOD BREAKDOWN SUMMARY — CUBE {instrument.upper()} × {n_channels} — v{al_version}")
 print("=" * 70)
 print(f"  Instrument:              {instrument}")
 print(f"  Channels:                {n_channels}")
@@ -892,10 +780,6 @@ def full_cube_pipeline_from_params(params_tree):
 print(f"  Edge zeroed pixels:      {edge_pixels_total}")
 print("-" * 70)
 print(f"  Cube reference log_evidence:  {cube_log_evidence_ref}")
-if full_cube_result is not None:
-    print(f"  Cube JIT log_evidence:        {float(full_cube_result)}")
-else:
-    print(f"  Cube JIT log_evidence:        SKIPPED (CUBE_FULL_JIT=1 to enable)")
 print("-" * 70)
 
 max_label = max(len(label) for label, _ in likelihood_steps)
@@ -911,10 +795,6 @@ def full_cube_pipeline_from_params(params_tree):
 
 print("-" * 70)
 print(f"      {'TOTAL (step-by-step cube cost)':<{max_label}}  {step_total:>12.6f} s")
-if np.isfinite(full_pipeline_per_call):
-    print(f"      {'Full pipeline cube (single JIT)':<{max_label}}  {full_pipeline_per_call:>12.6f} s")
-else:
-    print(f"      {'Full pipeline cube (single JIT)':<{max_label}}  SKIPPED")
 print(f"      {f'Shared-Lᵀ W̃ L savings (curvature only, est.)':<{max_label}}  {shared_lwl_savings:>12.6f} s")
 print("=" * 70)
 
@@ -937,9 +817,6 @@ def full_cube_pipeline_from_params(params_tree):
         "regularization_coefficient": regularization_coefficient,
     },
     "cube_log_evidence_eager": cube_log_evidence_ref,
-    "cube_log_evidence_jit": (
-        float(full_cube_result) if full_cube_result is not None else None
-    ),
     "log_evidence_per_channel_eager": [float(le) for le in log_evidence_per_channel],
     "steps_cube_cost": {label: per_call for label, per_call in likelihood_steps},
     "per_channel_costs": {
@@ -950,15 +827,13 @@ def full_cube_pipeline_from_params(params_tree):
         "log_evidence": log_evidence_per_channel_cost,
     },
     "total_step_by_step_cube": step_total,
-    "full_pipeline_cube_single_jit": full_pipeline_per_call,
     "shared_lwl_savings_estimate": shared_lwl_savings,
-    "vmap": "SKIPPED — cube batching axis is 'datasets', not 'parameters'",
 }
 
 dict_path, chart_path = resolve_output_paths(
     _cli,
-    default_dir=_workspace_root / "results" / "likelihood" / "datacube",
-    default_basename=f"delaunay_likelihood_summary_{instrument}_v{al_version}",
+    default_dir=_workspace_root / "results" / "breakdown" / "datacube",
+    default_basename=f"delaunay_breakdown_{instrument}_v{al_version}",
 )
 dict_path.write_text(json.dumps(likelihood_summary, indent=2))
 print(f"\n  Results dict saved to: {dict_path}")
@@ -983,14 +858,6 @@ def full_cube_pipeline_from_params(params_tree):
         fontsize=9,
     )
 
-if np.isfinite(full_pipeline_per_call):
-    ax.axvline(
-        full_pipeline_per_call,
-        color="#C44E52",
-        linestyle="--",
-        linewidth=1.5,
-        label=f"Full pipeline cube (single JIT): {full_pipeline_per_call:.6f} s",
-    )
 ax.axvline(
     shared_lwl_savings,
     color="#8172B2",
@@ -1004,7 +871,7 @@ def full_cube_pipeline_from_params(params_tree):
 ax.invert_yaxis()
 ax.set_xlabel("Cube cost per call (s)", fontsize=11)
 fig.suptitle(
-    f"Delaunay Datacube Likelihood — {instrument.upper()} × {n_channels} channels",
+    f"Delaunay Datacube Likelihood Breakdown — {instrument.upper()} × {n_channels} channels",
     fontsize=12,
     fontweight="bold",
 )
@@ -1029,8 +896,8 @@ def full_cube_pipeline_from_params(params_tree):
 # ===================================================================
 #
 # Identical channels = exact N × single-channel log-evidence (for "sma").
-# For "hannah" the per-channel literal isn't pinned yet, so the assertion is
-# skipped until the value below is filled in from a clean run.
+# For "alma" / "alma_high" the per-channel literal isn't pinned yet, so the
+# assertion is skipped until the value below is filled in from a clean run.
 EXPECTED_LOG_EVIDENCE_PER_CHANNEL = {
     "sma": None,
     "alma": None,
@@ -1064,11 +931,3 @@ def full_cube_pipeline_from_params(params_tree):
         f"\n  Eager cube regression assertion PASSED: log_evidence matches "
         f"{expected_cube_log_evidence:.6f}"
     )
-    if full_cube_result is not None:
-        np.testing.assert_allclose(
-            float(full_cube_result),
-            expected_cube_log_evidence,
-            rtol=1e-3,
-            err_msg=f"datacube/delaunay[{instrument}]: regression — full cube log_evidence drifted",
-        )
-        print(f"  Full-pipeline cube regression assertion PASSED")
diff --git a/likelihood/imaging/delaunay.py b/likelihood_breakdown/imaging/delaunay.py
similarity index 81%
rename from likelihood/imaging/delaunay.py
rename to likelihood_breakdown/imaging/delaunay.py
index 2e2cb0c..3eb7e88 100644
--- a/likelihood/imaging/delaunay.py
+++ b/likelihood_breakdown/imaging/delaunay.py
@@ -1,20 +1,20 @@
 """
-JAX Profiling: Delaunay Imaging Likelihood (Step-by-Step)
-=========================================================
+JAX Profiling: Delaunay Imaging Likelihood — Per-Step Breakdown
+================================================================
 
-Profiles each step of the JAX likelihood function for an imaging dataset where
-the source galaxy is reconstructed using a Delaunay triangulation mesh with
-ConstantSplit regularization.
+Decomposes the JAX likelihood function for an imaging dataset (Hilbert/Delaunay
+source model) into its individual pipeline steps and JIT-profiles each one
+separately. This script is the **breakdown** counterpart to
+``likelihood_runtime/imaging/delaunay.py``, which measures only the
+full-pipeline single-JIT cost.
 
-Key differences from the rectangular pixelization profiling script:
+Key differences from the rectangular pixelization breakdown script:
 
-- Mesh vertices are computed in the **image-plane** via an Overlay grid, then
-  ray-traced to the source-plane (rectangular computes directly in source-plane).
+- Mesh vertices are computed in the **image-plane** via a Hilbert image mesh,
+  then ray-traced to the source-plane.
 - Edge points are appended around the mask border and zeroed during inversion.
-- Uses **InterpolatorDelaunay** (barycentric interpolation within triangles)
-  instead of bilinear interpolation on a rectangular grid.
-- Uses **ConstantSplit** regularization (cross-derivative scheme) instead of
-  the simpler Constant neighbour-difference scheme.
+- Uses **InterpolatorDelaunay** (barycentric interpolation within triangles).
+- Uses **ConstantSplit** regularization (cross-derivative scheme).
 - Delaunay triangulation itself uses scipy on CPU and cannot be JIT-compiled.
 
 Pipeline steps:
@@ -26,18 +26,22 @@
 5. Border relocation (data grid + mesh grid)
 6. Delaunay triangulation + interpolation + mapper
 7. Mapping matrix
-8. Blurred mapping matrix (PSF convolution)
+8. Blurred mapping matrix / Inversion setup (steps 5-8 combined)
 9. Data vector (D)
 10. Curvature matrix (F)
 11. Regularization matrix (H) — ConstantSplit scheme
 12. Regularized reconstruction: s = (F + H)^{-1} D
 13. Map reconstruction to image + log evidence
 
-Caveat: XLA may fuse operations differently when compiled as one program vs
-separate pieces, so per-step timings are approximate. They are still useful
-for identifying which step dominates.
+Per-step timing is approximate: XLA may fuse operations differently when
+compiled as one program vs separate pieces. All JAX timings use
+``block_until_ready()`` to force synchronous measurement.
 
-All JAX timings use `block_until_ready()` to force synchronous measurement.
+Output
+------
+
+Results JSON and PNG are written to ``results/breakdown/imaging/`` using
+the basename ``delaunay_breakdown_{instrument}_v{al_version}``.
 """
 
 import numpy as np
@@ -54,8 +58,8 @@
 import autoarray as aa
 from autofit.jax import register_model as _register_model_pytrees
 
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from adapt_image_util import adapt_image_for_dataset  # noqa: E402
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # Instrument configuration
@@ -73,8 +77,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
     device_info_dict,
@@ -217,14 +219,6 @@ def jit_profile(func, label, *args, n_repeats=10):
 # ---------------------------------------------------------------------------
 # 2. Adapt image + image mesh (Hilbert)
 # ---------------------------------------------------------------------------
-#
-# ``image_mesh.Hilbert`` places the source mesh vertices in the image plane by
-# inverse-transform-sampling a Hilbert-curve ordering of the lensed source
-# adapt image. The result is a sparser mesh in faint regions and a denser one
-# where the source actually lives — production-grade, replaces the
-# uniform-coverage ``image_mesh.Overlay`` + circular-edge fallback that
-# preceded the Hilbert path. ``zeroed_pixels=0`` because Hilbert's placement
-# is data-driven; there are no fixed-position edge points to mask out.
 
 print("\n--- Adapt image (lensed source) ---")
 
@@ -897,105 +891,7 @@ def compute_log_evidence(
 print("  Assertion PASSED: inversion-matrix log_evidence matches FitImaging.log_evidence")
 
 # ===================================================================
-# PART C — Full-pipeline JIT for comparison
-# ===================================================================
-
-print("\n" + "=" * 70)
-print("FULL-PIPELINE JIT (for comparison)")
-print("=" * 70)
-
-analysis = al.AnalysisImaging(dataset=dataset, adapt_images=adapt_images, use_jax=True)
-
-def full_pipeline_from_params(params_tree):
-    return analysis.log_likelihood_function(instance=params_tree)
-
-_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
-full_pipeline_per_call = timer.records[-1][1] / 10
-
-print(f"  full log_likelihood = {full_result}")
-
-# ===================================================================
-# PART D — vmap + correctness
-# ===================================================================
-
-print("\n--- vmap batched evaluation ---")
-
-# WARNING: The vmap compilation for the Delaunay pipeline takes 20+ minutes on CPU.
-# The XLA graph for a batched Delaunay inversion (including scipy triangulation,
-# border relocation, interpolation, mapping matrix construction, and PSF convolution)
-# is extremely large. The single-call JIT above compiles in ~2s and runs in ~1.8s,
-# but vmap recompiles the entire graph for batch_size independent evaluations.
-#
-# This is likely a candidate for optimisation — either via custom_vjp to avoid
-# retracing the full pipeline, or by restructuring the Delaunay steps to reduce
-# the XLA graph size. For now, skip vmap by default and run it only when explicitly
-# requested via DELAUNAY_VMAP=1 environment variable.
-
-import os
-run_vmap = os.environ.get("DELAUNAY_VMAP", "0") == "1"
-
-if not run_vmap:
-    print("  SKIPPED: vmap compilation takes 20+ minutes for Delaunay pipeline.")
-    print("  Set DELAUNAY_VMAP=1 to run this section.")
-    vmap_batch_time = None
-    vmap_per_call = None
-    vmap_speedup = None
-else:
-
-    batch_size = 3
-    parameters = jax.tree_util.tree_map(
-        lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
-        params_tree,
-    )
-
-    vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
-
-    with timer.section("vmap_first_call"):
-        result_vmap = vmapped_full(parameters)
-        block(result_vmap)
-
-    n_vmap_repeats = 10
-    with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
-        for _ in range(n_vmap_repeats):
-            result_vmap = vmapped_full(parameters)
-            block(result_vmap)
-
-    vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
-    vmap_per_call = vmap_batch_time / batch_size
-    vmap_speedup = full_pipeline_per_call / vmap_per_call
-
-    print(f"  batch results = {result_vmap}")
-    print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
-    print(f"  vmap per call:         {vmap_per_call:.6f} s")
-    print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
-    print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
-
-    np.testing.assert_allclose(
-        np.array(result_vmap),
-        float(full_result),
-        rtol=1e-4,
-        err_msg="delaunay: JAX vmap likelihood mismatch",
-    )
-    print("  Correctness check PASSED")
-
-    # --- Static memory analysis ---
-
-    print("\n--- Static memory analysis ---")
-
-    lowered_batched = vmapped_full.lower(parameters)
-    compiled_batched = lowered_batched.compile()
-
-    memory_analysis = compiled_batched.memory_analysis()
-    print(f"  Output size:  {memory_analysis.output_size_in_bytes / 1024**2:.3f} MB")
-    print(f"  Temp size:    {memory_analysis.temp_size_in_bytes / 1024**2:.3f} MB")
-    print(
-        f"  Total:        "
-        f"{(memory_analysis.output_size_in_bytes + memory_analysis.temp_size_in_bytes) / 1024**2:.3f} MB"
-    )
-
-
-# ===================================================================
-# JAX Likelihood Function Summary
+# Per-step breakdown summary + JSON + PNG
 # ===================================================================
 
 import json
@@ -1006,7 +902,7 @@ def full_pipeline_from_params(params_tree):
 al_version = al.__version__
 
 print("\n" + "=" * 70)
-print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print(f"PER-STEP BREAKDOWN SUMMARY — {instrument.upper()} — v{al_version}")
 print("=" * 70)
 print(f"  Instrument:            {instrument}")
 print(f"  Pixel scale:           {pixel_scale} arcsec/pixel")
@@ -1025,17 +921,11 @@ def full_pipeline_from_params(params_tree):
 
 print("-" * 70)
 print(f"      {'TOTAL (step-by-step)':<{max_label}}  {step_total:>12.6f} s")
-print(f"      {'Full pipeline (single JIT)':<{max_label}}  {full_pipeline_per_call:>12.6f} s")
-if vmap_per_call is not None:
-    print(f"      {f'vmap batch (per call)':<{max_label}}  {vmap_per_call:>12.6f} s")
-    print(f"      {f'vmap speedup vs single JIT':<{max_label}}  {vmap_speedup:>11.1f}x")
-else:
-    print(f"      {'vmap':<{max_label}}  {'SKIPPED':>12}")
 print("=" * 70)
 
 # --- Save results dictionary ---
 
-likelihood_summary = {
+breakdown_summary = {
     "autolens_version": al_version,
     "device": device_info_dict(),
     "instrument": instrument,
@@ -1049,24 +939,14 @@ def full_pipeline_from_params(params_tree):
     },
     "steps": {label: per_call for label, per_call in likelihood_steps},
     "total_step_by_step": step_total,
-    "full_pipeline_single_jit": full_pipeline_per_call,
-    "vmap": "SKIPPED — compilation takes 20+ minutes (set DELAUNAY_VMAP=1)",
 }
 
-if vmap_per_call is not None:
-    likelihood_summary["vmap"] = {
-        "batch_size": batch_size,
-        "batch_time": vmap_batch_time,
-        "per_call": vmap_per_call,
-        "speedup_vs_single_jit": round(vmap_speedup, 1),
-    }
-
 dict_path, chart_path = resolve_output_paths(
     _cli,
-    default_dir=_workspace_root / "results" / "likelihood" / "imaging",
-    default_basename=f"delaunay_likelihood_summary_{instrument}_v{al_version}",
+    default_dir=_workspace_root / "results" / "breakdown" / "imaging",
+    default_basename=f"delaunay_breakdown_{instrument}_v{al_version}",
 )
-dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+dict_path.write_text(json.dumps(breakdown_summary, indent=2))
 print(f"\n  Results dict saved to: {dict_path}")
 
 # --- Save bar chart ---
@@ -1087,28 +967,12 @@ def full_pipeline_from_params(params_tree):
         fontsize=9,
     )
 
-ax.axvline(
-    full_pipeline_per_call,
-    color="#C44E52",
-    linestyle="--",
-    linewidth=1.5,
-    label=f"Full pipeline (single JIT): {full_pipeline_per_call:.6f} s",
-)
-if vmap_per_call is not None:
-    ax.axvline(
-        vmap_per_call,
-        color="#55A868",
-        linestyle="--",
-        linewidth=1.5,
-        label=f"vmap batch per call: {vmap_per_call:.6f} s ({vmap_speedup:.1f}x faster)",
-    )
-
 ax.set_yticks(y_pos)
 ax.set_yticklabels(labels, fontsize=10)
 ax.invert_yaxis()
 ax.set_xlabel("Time per call (s)", fontsize=11)
 fig.suptitle(
-    f"Delaunay Imaging Likelihood — {instrument.upper()}",
+    f"Delaunay Imaging Likelihood — Per-Step Breakdown — {instrument.upper()}",
     fontsize=12,
     fontweight="bold",
 )
@@ -1118,7 +982,6 @@ def full_pipeline_from_params(params_tree):
     f"total: {step_total:.6f} s",
     fontsize=9,
 )
-ax.legend(loc="lower right", fontsize=9)
 ax.margins(x=0.15)
 fig.tight_layout()
 
@@ -1128,15 +991,9 @@ def full_pipeline_from_params(params_tree):
 
 
 # ===================================================================
-# Regression assertion — realistic-scale deterministic log-evidence
+# Regression assertion — eager log_evidence only
 # ===================================================================
-#
-# Simulator truth parameters via GaussianPrior(mean=truth, sigma=small)
-# make the full-pipeline log-evidence deterministic at the prior median.
-# Hilbert image_mesh + 1500-pixel Delaunay; rtol=1e-3 for the JIT paths
-# matches imaging/pixelization (adaptive meshes amplify fp drift through
-# Cholesky / log_det). vmap result asserted only when DELAUNAY_VMAP=1
-# (vmap compile takes 20+ min).
+
 EXPECTED_LOG_EVIDENCE_HST = 29110.92085793  # 1500-pixel Hilbert/Delaunay, MGE-60 lens, adapt_image=lensed_source
 
 np.testing.assert_allclose(
@@ -1152,17 +1009,3 @@ def full_pipeline_from_params(params_tree):
     f"  Eager regression assertion PASSED: log_evidence matches "
     f"{EXPECTED_LOG_EVIDENCE_HST:.6f}"
 )
-np.testing.assert_allclose(
-    float(full_result),
-    EXPECTED_LOG_EVIDENCE_HST,
-    rtol=1e-3,
-    err_msg=f"imaging/delaunay[{instrument}]: regression — full log_evidence drifted",
-)
-if run_vmap:
-    np.testing.assert_allclose(
-        np.array(result_vmap),
-        EXPECTED_LOG_EVIDENCE_HST,
-        rtol=1e-3,
-        err_msg=f"imaging/delaunay[{instrument}]: regression — vmap log_evidence drifted",
-    )
-print(f"  Regression assertion PASSED: log_evidence matches {EXPECTED_LOG_EVIDENCE_HST:.6f}")
diff --git a/likelihood/imaging/mge.py b/likelihood_breakdown/imaging/mge.py
similarity index 75%
rename from likelihood/imaging/mge.py
rename to likelihood_breakdown/imaging/mge.py
index 84eccc4..6b259f6 100644
--- a/likelihood/imaging/mge.py
+++ b/likelihood_breakdown/imaging/mge.py
@@ -1,52 +1,45 @@
 """
-JAX Profiling: MGE Imaging Likelihood (Step-by-Step)
-=====================================================
-
-Profiles each step of the JAX likelihood function for an imaging dataset where
-the lens galaxy's light is modelled with a multi-Gaussian expansion (MGE).
-
-Rather than timing the whole likelihood as a single JIT-compiled block (which
-hides internal bottlenecks), this script JIT-compiles and times each step of
-the pipeline individually:
-
-1. Instance from parameter vector
-2. Build Tracer
-3. Ray-trace grids through the lens
-4. Compute mapping matrix (per-profile images before PSF)
-5. Compute blurred mapping matrix (PSF convolution)
-6. Compute data vector  (D)
-7. Compute curvature matrix  (F)
-8. Reconstruction via positive-only NNLS
-9. Map reconstruction back to image plane
-10. Chi-squared and log likelihood
+JAX Profiling: MGE Imaging Likelihood — Per-Step Breakdown
+===========================================================
+
+Decomposes the JAX likelihood function for an imaging dataset (MGE lens model)
+into its individual pipeline steps and JIT-profiles each one separately. This
+script is the **breakdown** counterpart to ``likelihood_runtime/imaging/mge.py``,
+which measures only the full-pipeline single-JIT cost.
+
+Per-step timing is approximate: XLA may fuse operations differently when
+compiled as one program vs separate pieces, but the breakdown is still useful
+for identifying which step dominates the runtime budget.
+
+Steps profiled:
+
+1. Ray-trace grids
+2. Mapping matrix (linear profile images before PSF)
+3. Blurred mapping matrix (PSF convolution of each profile)
+4. Data vector (D)
+5. Curvature matrix (F)
+6. Reconstruction via positive-only NNLS
+7. Map reconstruction back to image plane
+8. Chi-squared and log likelihood
 
 Note: because the MGE model uses only linear light profiles (lp_linear),
 there is no non-linear blurred image or profile-subtracted image step.
 
-Caveat: XLA may fuse operations differently when compiled as one program vs
-separate pieces, so per-step timings are approximate. They are still useful
-for identifying which step dominates.
+All JAX timings use ``block_until_ready()`` to force synchronous measurement.
 
-All JAX timings use `block_until_ready()` to force synchronous measurement.
-
-Pytree-native parameter inputs (recommended pattern)
-----------------------------------------------------
+Pytree-native parameter inputs
+------------------------------
 
 This script uses ``af.ModelInstance`` as the JIT input via PyAutoFit's
-opt-in pytree registration (``autofit.jax.register_model(model)``). The
-JIT'd closures consume the instance directly, so:
-
-* ``model.instance_from_vector`` is no longer called inside the JIT trace —
-  parameter unpacking happens once at registration time and JAX walks the
-  pytree on every call.
-* Parameter identity is preserved through ``jax.jit`` and ``jax.vmap``;
-  XLA cache keys reflect the structured pytree, not a flat vector shape.
-* ``vmap`` batching is ``jax.tree_util.tree_map`` over the instance leaves
-  — callers no longer have to stack a ``(batch, N)`` array.
-
-New profiling scripts should follow this pattern. The flat-vector path in
-``Fitness.call`` / ``model.instance_from_vector(..., xp=jnp)`` remains the
-production likelihood entry point and is intentionally untouched here.
+opt-in pytree registration (``autofit.jax.register_model(model)``). See
+``likelihood_runtime/imaging/mge.py`` for a full description of the pytree
+pattern. This breakdown script shares the same setup.
+
+Output
+------
+
+Results JSON and PNG are written to ``results/breakdown/imaging/`` using
+the basename ``mge_breakdown_{instrument}_v{al_version}``.
 """
 
 import numpy as np
@@ -79,7 +72,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
@@ -602,100 +594,7 @@ def compute_log_likelihood(data, noise_map, mapped_recon):
 print("  Assertion PASSED: step-by-step matches FitImaging.log_likelihood")
 
 # ===================================================================
-# PART C — Full-pipeline JIT for comparison
-# ===================================================================
-
-print("\n" + "=" * 70)
-print("FULL-PIPELINE JIT (for comparison)")
-print("=" * 70)
-
-# Build the analysis with ``use_jax=True`` so its ``log_likelihood_function``
-# threads ``xp=jnp`` through every internal call (border relocation, profile
-# evaluation, inversion, etc.). This is the same wiring that ``Fitness.call``
-# uses in production — we just feed it our pytree-native instance directly
-# instead of going through ``model.instance_from_vector(parameters, xp=jnp)``.
-analysis = al.AnalysisImaging(dataset=dataset, use_jax=True)
-
-def full_pipeline_from_params(params_tree):
-    """Full likelihood from a pytree-shaped ``ModelInstance``.
-
-    No flat-vector unpacking inside the trace — the instance crosses the JIT
-    boundary directly, with constants (redshifts, etc.) kept static via the
-    ``aux_data`` partition set up by ``autofit.jax.register_model``.
-    """
-    return analysis.log_likelihood_function(instance=params_tree)
-
-_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
-full_pipeline_per_call = timer.records[-1][1] / 10
-
-print(f"  full log_likelihood = {full_result}")
-
-# ===================================================================
-# PART D — vmap + correctness
-# ===================================================================
-
-print("\n--- vmap batched evaluation ---")
-
-batch_size = 3
-
-# Build the batched pytree: every leaf gets a fresh leading batch axis. No
-# flat-vector reshaping required — JAX walks the pytree via the registration
-# we added in PART A.
-parameters = jax.tree_util.tree_map(
-    lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
-    params_tree,
-)
-
-vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
-
-with timer.section("vmap_first_call"):
-    result_vmap = vmapped_full(parameters)
-    block(result_vmap)
-
-n_vmap_repeats = 10
-with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
-    for _ in range(n_vmap_repeats):
-        result_vmap = vmapped_full(parameters)
-        block(result_vmap)
-
-vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
-vmap_per_call = vmap_batch_time / batch_size
-vmap_speedup = full_pipeline_per_call / vmap_per_call
-
-print(f"  batch results = {result_vmap}")
-print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
-print(f"  vmap per call:         {vmap_per_call:.6f} s")
-print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
-print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
-
-np.testing.assert_allclose(
-    np.array(result_vmap),
-    float(full_result),
-    rtol=1e-4,
-    err_msg="mge: JAX vmap likelihood mismatch",
-)
-print("  Correctness check PASSED")
-
-# ===================================================================
-# PART E — Static memory analysis
-# ===================================================================
-
-print("\n--- Static memory analysis ---")
-
-lowered_batched = vmapped_full.lower(parameters)
-compiled_batched = lowered_batched.compile()
-
-memory_analysis = compiled_batched.memory_analysis()
-print(f"  Output size:  {memory_analysis.output_size_in_bytes / 1024**2:.3f} MB")
-print(f"  Temp size:    {memory_analysis.temp_size_in_bytes / 1024**2:.3f} MB")
-print(
-    f"  Total:        "
-    f"{(memory_analysis.output_size_in_bytes + memory_analysis.temp_size_in_bytes) / 1024**2:.3f} MB"
-)
-
-
-# ===================================================================
-# JAX Likelihood Function Summary
+# Per-step breakdown summary + JSON + PNG
 # ===================================================================
 
 import json
@@ -706,7 +605,7 @@ def full_pipeline_from_params(params_tree):
 al_version = al.__version__
 
 print("\n" + "=" * 70)
-print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print(f"PER-STEP BREAKDOWN SUMMARY — {instrument.upper()} — v{al_version}")
 print("=" * 70)
 print(f"  Instrument:            {instrument}")
 print(f"  Pixel scale:           {pixel_scale} arcsec/pixel")
@@ -724,15 +623,13 @@ def full_pipeline_from_params(params_tree):
 
 print("-" * 70)
 print(f"      {'TOTAL (step-by-step)':<{max_label}}  {step_total:>12.6f} s")
-print(f"      {'Full pipeline (single JIT)':<{max_label}}  {full_pipeline_per_call:>12.6f} s")
-print(f"      {f'vmap batch={batch_size} (per call)':<{max_label}}  {vmap_per_call:>12.6f} s")
-print(f"      {f'vmap speedup vs single JIT':<{max_label}}  {vmap_speedup:>11.1f}x")
 print("=" * 70)
 
 # --- Save results dictionary ---
 
-likelihood_summary = {
+breakdown_summary = {
     "autolens_version": al_version,
+    "device": device_info_dict(),
     "instrument": instrument,
     "configuration": {
         "pixel_scale_arcsec": pixel_scale,
@@ -743,20 +640,14 @@ def full_pipeline_from_params(params_tree):
     },
     "steps": {label: per_call for label, per_call in likelihood_steps},
     "total_step_by_step": step_total,
-    "full_pipeline_single_jit": full_pipeline_per_call,
-    "vmap": {
-        "batch_size": batch_size,
-        "batch_time": vmap_batch_time,
-        "per_call": vmap_per_call,
-        "speedup_vs_single_jit": round(vmap_speedup, 1),
-    },
 }
 
-results_dir = _workspace_root / "results" / "likelihood" / "imaging"
-results_dir.mkdir(parents=True, exist_ok=True)
-
-dict_path = results_dir / f"mge_likelihood_summary_{instrument}_v{al_version}.json"
-dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+dict_path, chart_path = resolve_output_paths(
+    _cli,
+    default_dir=_workspace_root / "results" / "breakdown" / "imaging",
+    default_basename=f"mge_breakdown_{instrument}_v{al_version}",
+)
+dict_path.write_text(json.dumps(breakdown_summary, indent=2))
 print(f"\n  Results dict saved to: {dict_path}")
 
 # --- Save bar chart ---
@@ -777,27 +668,12 @@ def full_pipeline_from_params(params_tree):
         fontsize=9,
     )
 
-ax.axvline(
-    full_pipeline_per_call,
-    color="#C44E52",
-    linestyle="--",
-    linewidth=1.5,
-    label=f"Full pipeline (single JIT): {full_pipeline_per_call:.6f} s",
-)
-ax.axvline(
-    vmap_per_call,
-    color="#55A868",
-    linestyle="--",
-    linewidth=1.5,
-    label=f"vmap batch={batch_size} per call: {vmap_per_call:.6f} s ({vmap_speedup:.1f}x faster)",
-)
-
 ax.set_yticks(y_pos)
 ax.set_yticklabels(labels, fontsize=10)
 ax.invert_yaxis()
 ax.set_xlabel("Time per call (s)", fontsize=11)
 fig.suptitle(
-    f"MGE Imaging Likelihood — {instrument.upper()}",
+    f"MGE Imaging Likelihood — Per-Step Breakdown — {instrument.upper()}",
     fontsize=12,
     fontweight="bold",
 )
@@ -807,24 +683,18 @@ def full_pipeline_from_params(params_tree):
     f"total: {step_total:.6f} s",
     fontsize=9,
 )
-ax.legend(loc="lower right", fontsize=9)
 ax.margins(x=0.15)
 fig.tight_layout()
 
-chart_path = results_dir / f"mge_likelihood_summary_{instrument}_v{al_version}.png"
 fig.savefig(chart_path, dpi=150)
 plt.close(fig)
 print(f"  Bar chart saved to:    {chart_path}")
 
 
 # ===================================================================
-# Regression assertion — realistic-scale deterministic likelihood
+# Regression assertion — eager log_likelihood only
 # ===================================================================
-#
-# Simulator truth parameters (mass + shear fixed; MGE bulges free around
-# default centre/ell_comps priors) put the evaluation point at the
-# physically-meaningful truth operating point. Eager, JIT, and vmap all
-# agree to ~1e-11 precision.
+
 EXPECTED_LOG_LIKELIHOOD_HST = 27379.38890685539
 
 np.testing.assert_allclose(
@@ -840,16 +710,3 @@ def full_pipeline_from_params(params_tree):
     f"  Eager regression assertion PASSED: log_likelihood matches "
     f"{EXPECTED_LOG_LIKELIHOOD_HST:.6f}"
 )
-np.testing.assert_allclose(
-    float(full_result),
-    EXPECTED_LOG_LIKELIHOOD_HST,
-    rtol=1e-4,
-    err_msg=f"imaging/mge[{instrument}]: regression — full log_likelihood drifted",
-)
-np.testing.assert_allclose(
-    np.array(result_vmap),
-    EXPECTED_LOG_LIKELIHOOD_HST,
-    rtol=1e-4,
-    err_msg=f"imaging/mge[{instrument}]: regression — vmap log_likelihood drifted",
-)
-print(f"  Regression assertion PASSED: log_likelihood matches {EXPECTED_LOG_LIKELIHOOD_HST:.6f}")
diff --git a/likelihood/imaging/pixelization.py b/likelihood_breakdown/imaging/pixelization.py
similarity index 80%
rename from likelihood/imaging/pixelization.py
rename to likelihood_breakdown/imaging/pixelization.py
index 5416338..697499e 100644
--- a/likelihood/imaging/pixelization.py
+++ b/likelihood_breakdown/imaging/pixelization.py
@@ -1,14 +1,14 @@
 """
-JAX Profiling: Pixelization Imaging Likelihood (Step-by-Step)
-=============================================================
+JAX Profiling: Pixelization Imaging Likelihood — Per-Step Breakdown
+====================================================================
 
-Profiles each step of the JAX likelihood function for an imaging dataset where
-the source galaxy is reconstructed using a rectangular pixelization with
-constant regularization.
+Decomposes the JAX likelihood function for an imaging dataset (rectangular
+pixelization source model) into its individual pipeline steps and
+JIT-profiles each one separately. This script is the **breakdown** counterpart
+to ``likelihood_runtime/imaging/pixelization.py``, which measures only the
+full-pipeline single-JIT cost.
 
-Rather than timing the whole likelihood as a single JIT-compiled block (which
-hides internal bottlenecks), this script JIT-compiles and times each step of
-the pipeline individually:
+Steps profiled:
 
 1. Ray-trace grids through the lens
 2. Blurred image of lens light (non-linear profiles)
@@ -17,18 +17,24 @@
 5. Overlay grid (source pixel centres)
 6. Interpolation weights and mapper construction
 7. Mapping matrix
-8. Blurred mapping matrix (PSF convolution)
+8. Blurred mapping matrix (PSF convolution) — steps 4-8 are combined into one "Inversion setup" stage
 9. Data vector (D)
 10. Curvature matrix (F)
 11. Regularization matrix (H)
 12. Regularized reconstruction: s = (F + H)^{-1} D
 13. Map reconstruction to image + log evidence
 
-Caveat: XLA may fuse operations differently when compiled as one program vs
-separate pieces, so per-step timings are approximate. They are still useful
-for identifying which step dominates.
+Per-step timing is approximate: XLA may fuse operations differently when
+compiled as one program vs separate pieces, but the breakdown is still useful
+for identifying which step dominates the runtime budget.
 
-All JAX timings use `block_until_ready()` to force synchronous measurement.
+All JAX timings use ``block_until_ready()`` to force synchronous measurement.
+
+Output
+------
+
+Results JSON and PNG are written to ``results/breakdown/imaging/`` using
+the basename ``pixelization_breakdown_{instrument}_v{al_version}``.
 """
 
 import numpy as np
@@ -47,8 +53,8 @@
 
 # Shared adapt-image loader: load or compute+cache `lensed_source.fits`
 # next to the dataset, then return the masked ``aa.Array2D``.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from adapt_image_util import adapt_image_for_dataset  # noqa: E402
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # Instrument configuration
@@ -66,8 +72,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
     device_info_dict,
@@ -301,10 +305,7 @@ def jit_profile(func, label, *args, n_repeats=10):
 
 # ---------------------------------------------------------------------------
 # 4. Adapt image — PSF-convolved lensed-source image used by
-#    ``RectangularAdaptImage`` to weight mesh pixels. Loads ``lensed_source.fits``
-#    from the dataset directory if present, otherwise computes it from the
-#    truth tracer and caches the file for sibling scripts on the same
-#    instrument.
+#    ``RectangularAdaptImage`` to weight mesh pixels.
 # ---------------------------------------------------------------------------
 
 print("\n--- Adapt image (lensed source) ---")
@@ -313,10 +314,6 @@ def jit_profile(func, label, *args, n_repeats=10):
     adapt_image = adapt_image_for_dataset(
         dataset_path=dataset_path, dataset=dataset
     )
-    # ``galaxy_image_dict`` (Galaxy-object-keyed) feeds the eager-path
-    # ``image_for_galaxy`` lookup; ``galaxy_name_image_dict`` (path-tuple
-    # str-keyed) is rebuilt inside JIT closures where the Galaxy objects
-    # are reconstructed on every call. Both must be supplied here.
     adapt_images = al.AdaptImages(
         galaxy_image_dict={instance.galaxies.source: adapt_image},
         galaxy_name_image_dict={"('galaxies', 'source')": adapt_image},
@@ -863,129 +860,7 @@ def compute_log_evidence(
 print("  Assertion PASSED: inversion-matrix log_evidence matches FitImaging.log_evidence")
 
 # ===================================================================
-# PART C — Full-pipeline JIT for comparison
-# ===================================================================
-
-print("\n" + "=" * 70)
-print("FULL-PIPELINE JIT (for comparison)")
-print("=" * 70)
-
-analysis = al.AnalysisImaging(
-    dataset=dataset,
-    adapt_images=adapt_images,
-    settings=al.Settings(
-        use_border_relocator=True,
-        use_mixed_precision=_cli.use_mixed_precision,
-    ),
-    use_jax=True,
-)
-
-def full_pipeline_from_params(params_tree):
-    return analysis.log_likelihood_function(instance=params_tree)
-
-_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
-full_pipeline_per_call = timer.records[-1][1] / 10
-
-print(f"  full log_likelihood = {full_result}")
-
-# ===================================================================
-# PART D — vmap + correctness
-# ===================================================================
-#
-# NOTE: vmap requires at least one JAX array leaf in the params_tree.
-# When model.total_free_parameters == 0 (all params fixed to truth), the
-# pytree has no array leaves and vmap cannot batch over it. Skip in that case.
-
-print("\n--- vmap batched evaluation ---")
-
-batch_size = 3
-vmap_batch_time = None
-vmap_per_call = None
-vmap_speedup = None
-result_vmap = None
-
-_n_leaves = len(jax.tree_util.tree_leaves(params_tree))
-_vmap_skipped_reason = None
-if _n_leaves == 0:
-    _vmap_skipped_reason = (
-        "model has 0 free parameters (all fixed to truth); vmap "
-        "requires at least one array leaf."
-    )
-else:
-    parameters = jax.tree_util.tree_map(
-        lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
-        params_tree,
-    )
-
-    vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
-
-    # 1521-source-pixel adapt-mesh pipelines push the per-batch working
-    # set past 2.5 GB; on smaller GPUs the vmap compile / first call can
-    # OOM. Catch and skip cleanly rather than killing the script.
-    try:
-        with timer.section("vmap_first_call"):
-            result_vmap = vmapped_full(parameters)
-            block(result_vmap)
-    except Exception as exc:
-        if "RESOURCE_EXHAUSTED" in str(exc) or "Out of memory" in str(exc):
-            _vmap_skipped_reason = (
-                f"OOM during vmap first call (batch_size={batch_size}); skip vmap. "
-                f"Re-run on a bigger device or lower `batch_size`."
-            )
-        else:
-            raise
-
-if _vmap_skipped_reason is None and _n_leaves > 0:
-    n_vmap_repeats = 10
-    with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
-        for _ in range(n_vmap_repeats):
-            result_vmap = vmapped_full(parameters)
-            block(result_vmap)
-
-    vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
-    vmap_per_call = vmap_batch_time / batch_size
-    vmap_speedup = full_pipeline_per_call / vmap_per_call
-
-    print(f"  batch results = {result_vmap}")
-    print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
-    print(f"  vmap per call:         {vmap_per_call:.6f} s")
-    print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
-    print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
-
-    np.testing.assert_allclose(
-        np.array(result_vmap),
-        float(full_result),
-        rtol=1e-4,
-        err_msg="pixelization: JAX vmap likelihood mismatch",
-    )
-    print("  Correctness check PASSED")
-else:
-    print(f"  SKIPPED: {_vmap_skipped_reason}")
-
-# ===================================================================
-# PART E — Static memory analysis
-# ===================================================================
-
-print("\n--- Static memory analysis ---")
-
-if _vmap_skipped_reason is not None:
-    print(f"  SKIPPED: {_vmap_skipped_reason}")
-    memory_analysis = None
-else:
-    lowered_batched = vmapped_full.lower(parameters)
-    compiled_batched = lowered_batched.compile()
-
-    memory_analysis = compiled_batched.memory_analysis()
-    print(f"  Output size:  {memory_analysis.output_size_in_bytes / 1024**2:.3f} MB")
-    print(f"  Temp size:    {memory_analysis.temp_size_in_bytes / 1024**2:.3f} MB")
-    print(
-        f"  Total:        "
-        f"{(memory_analysis.output_size_in_bytes + memory_analysis.temp_size_in_bytes) / 1024**2:.3f} MB"
-    )
-
-
-# ===================================================================
-# JAX Likelihood Function Summary
+# Per-step breakdown summary + JSON + PNG
 # ===================================================================
 
 import json
@@ -996,7 +871,7 @@ def full_pipeline_from_params(params_tree):
 al_version = al.__version__
 
 print("\n" + "=" * 70)
-print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print(f"PER-STEP BREAKDOWN SUMMARY — {instrument.upper()} — v{al_version}")
 print("=" * 70)
 print(f"  Instrument:            {instrument}")
 print(f"  Pixel scale:           {pixel_scale} arcsec/pixel")
@@ -1015,17 +890,11 @@ def full_pipeline_from_params(params_tree):
 
 print("-" * 70)
 print(f"      {'TOTAL (step-by-step)':<{max_label}}  {step_total:>12.6f} s")
-print(f"      {'Full pipeline (single JIT)':<{max_label}}  {full_pipeline_per_call:>12.6f} s")
-if vmap_per_call is not None:
-    print(f"      {f'vmap batch={batch_size} (per call)':<{max_label}}  {vmap_per_call:>12.6f} s")
-    print(f"      {f'vmap speedup vs single JIT':<{max_label}}  {vmap_speedup:>11.1f}x")
-else:
-    print(f"      {'vmap':<{max_label}}  {'SKIPPED (0 free params)':>12}")
 print("=" * 70)
 
 # --- Save results dictionary ---
 
-likelihood_summary = {
+breakdown_summary = {
     "autolens_version": al_version,
     "device": device_info_dict(),
     "instrument": instrument,
@@ -1039,21 +908,14 @@ def full_pipeline_from_params(params_tree):
     },
     "steps": {label: per_call for label, per_call in likelihood_steps},
     "total_step_by_step": step_total,
-    "full_pipeline_single_jit": full_pipeline_per_call,
-    "vmap": "SKIPPED — model has 0 free parameters (all fixed to truth)" if vmap_per_call is None else {
-        "batch_size": batch_size,
-        "batch_time": vmap_batch_time,
-        "per_call": vmap_per_call,
-        "speedup_vs_single_jit": round(vmap_speedup, 1),
-    },
 }
 
 dict_path, chart_path = resolve_output_paths(
     _cli,
-    default_dir=_workspace_root / "results" / "likelihood" / "imaging",
-    default_basename=f"pixelization_likelihood_summary_{instrument}_v{al_version}",
+    default_dir=_workspace_root / "results" / "breakdown" / "imaging",
+    default_basename=f"pixelization_breakdown_{instrument}_v{al_version}",
 )
-dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+dict_path.write_text(json.dumps(breakdown_summary, indent=2))
 print(f"\n  Results dict saved to: {dict_path}")
 
 # --- Save bar chart ---
@@ -1074,28 +936,12 @@ def full_pipeline_from_params(params_tree):
         fontsize=9,
     )
 
-ax.axvline(
-    full_pipeline_per_call,
-    color="#C44E52",
-    linestyle="--",
-    linewidth=1.5,
-    label=f"Full pipeline (single JIT): {full_pipeline_per_call:.6f} s",
-)
-if vmap_per_call is not None:
-    ax.axvline(
-        vmap_per_call,
-        color="#55A868",
-        linestyle="--",
-        linewidth=1.5,
-        label=f"vmap batch={batch_size} per call: {vmap_per_call:.6f} s ({vmap_speedup:.1f}x faster)",
-    )
-
 ax.set_yticks(y_pos)
 ax.set_yticklabels(labels, fontsize=10)
 ax.invert_yaxis()
 ax.set_xlabel("Time per call (s)", fontsize=11)
 fig.suptitle(
-    f"Pixelization Imaging Likelihood — {instrument.upper()}",
+    f"Pixelization Imaging Likelihood — Per-Step Breakdown — {instrument.upper()}",
     fontsize=12,
     fontweight="bold",
 )
@@ -1105,7 +951,6 @@ def full_pipeline_from_params(params_tree):
     f"total: {step_total:.6f} s",
     fontsize=9,
 )
-ax.legend(loc="lower right", fontsize=9)
 ax.margins(x=0.15)
 fig.tight_layout()
 
@@ -1115,16 +960,9 @@ def full_pipeline_from_params(params_tree):
 
 
 # ===================================================================
-# Regression assertion — realistic-scale deterministic log-evidence
+# Regression assertion — eager log_evidence only
 # ===================================================================
-#
-# RectangularAdaptImage at prior medians anchors the regression on the
-# *eager* FitImaging value (deterministic to fp64 noise). The full-pipeline
-# single-JIT / vmap paths agree with eager to ~1e-3 only: adaptive mesh
-# weighting amplifies fp accumulation in Cholesky / log_det on the bigger
-# 1581x1581 mapping matrix relative to the non-adaptive baseline (which
-# previously matched at 1e-4). The 1e-3 envelope is still tight enough to
-# catch real numerical regressions while accommodating the adaptive path.
+
 EXPECTED_LOG_EVIDENCE_HST = 28370.27770182  # 39x39 = 1521 source pixels, MGE-60 lens light, adapt_image=lensed_source
 
 np.testing.assert_allclose(
@@ -1140,17 +978,3 @@ def full_pipeline_from_params(params_tree):
     f"  Eager regression assertion PASSED: log_evidence matches "
     f"{EXPECTED_LOG_EVIDENCE_HST:.6f}"
 )
-np.testing.assert_allclose(
-    float(full_result),
-    EXPECTED_LOG_EVIDENCE_HST,
-    rtol=1e-3,
-    err_msg=f"imaging/pixelization[{instrument}]: regression — full log_evidence drifted",
-)
-if result_vmap is not None:
-    np.testing.assert_allclose(
-        np.array(result_vmap),
-        EXPECTED_LOG_EVIDENCE_HST,
-        rtol=1e-3,
-        err_msg=f"imaging/pixelization[{instrument}]: regression — vmap log_evidence drifted",
-    )
-print(f"  Regression assertion PASSED: log_evidence matches {EXPECTED_LOG_EVIDENCE_HST:.6f}")
diff --git a/likelihood/interferometer/delaunay.py b/likelihood_breakdown/interferometer/delaunay.py
similarity index 70%
rename from likelihood/interferometer/delaunay.py
rename to likelihood_breakdown/interferometer/delaunay.py
index 29d75d9..75e971f 100644
--- a/likelihood/interferometer/delaunay.py
+++ b/likelihood_breakdown/interferometer/delaunay.py
@@ -1,87 +1,43 @@
 """
-JAX Profiling: Delaunay Interferometer Likelihood
-=================================================
+JAX Profiling: Delaunay Interferometer Likelihood — Per-Step Breakdown
+=======================================================================
 
-Profiles the JAX likelihood function for an interferometer dataset where the
-source galaxy is reconstructed using a Delaunay pixelization with cross-
-derivative (``ConstantSplit``) regularization, and the lens galaxy is an
-Isothermal + ExternalShear.
+Decomposes the JAX likelihood function for an interferometer dataset
+(Hilbert/Delaunay source model) into its individual pipeline steps and
+JIT-profiles each one separately. This script is the **breakdown** counterpart
+to ``likelihood_runtime/interferometer/delaunay.py``, which measures the
+full-pipeline single-JIT cost and vmap speedup.
 
-Mirrors ``likelihood/interferometer/pixelization.py`` (Phase 2) with the
-``RectangularUniform`` source replaced by a ``Delaunay`` mesh — matching
-``likelihood/imaging/delaunay.py`` so imaging vs interferometer Delaunay
-results can be compared side-by-side.
-
-Matches the step-by-step pedagogy of ``likelihood/imaging/delaunay.py``
+Matches the step-by-step pedagogy of ``likelihood_breakdown/imaging/delaunay.py``
 applied to the visibility-space pipeline. The 11 per-step JIT-profiled stages
 map 1:1 onto sections in
-``autolens_workspace/scripts/interferometer/features/datacube/likelihood_function.py``
-and its single-channel parent
-``interferometer/features/pixelization/likelihood_function.py``.
+``autolens_workspace/scripts/interferometer/features/datacube/likelihood_function.py``.
 
 Pipeline steps (matching the imaging-delaunay numbering for cross-reference;
 the two lens-light steps from the imaging sibling are dropped since the
 interferometer pixelization model has no parametric lens light):
 
  1. Ray-trace data grid to source plane.
- 2. Ray-trace mesh grid (image-plane Overlay vertices) to source plane.
+ 2. Ray-trace mesh grid (image-plane Hilbert vertices) to source plane.
  5. Border relocation (data grid + mesh grid).
  6. Delaunay triangulation + interpolation + mapper.
  7. Mapping matrix.
- 8. Transformed mapping matrix (NUFFT) — interferometer-specific. Replaces
-    imaging's PSF-convolved blurred mapping matrix; the difference is the
-    Fourier transform to visibility space rather than image-space convolution.
+ 8. Transformed mapping matrix (NUFFT) — interferometer-specific.
  9. Data vector D — visibility-space (real and imaginary components).
  10. Curvature matrix F — real and imaginary curvatures summed.
  11. Regularization matrix H — ConstantSplit (same as imaging).
- 12. Reconstruction s = NNLS(F + H, D) (same NNLS path as imaging).
- 13. Mapped reconstructed visibilities + log evidence (visibility-space χ²).
-
-Measures:
-
-1. Eager baseline: ``FitInterferometer`` with ``xp=np``, print
-   ``figure_of_merit`` / ``log_likelihood``.
-2. Per-step JIT profiling: each pipeline stage above gets its own
-   ``jit_profile()`` call (lower / compile / first-call / steady-state ×10).
-3. Full-pipeline JIT: ``jax.jit(analysis.log_likelihood_function)`` on a
-   pytree-registered ``ModelInstance``. Measure lower / compile / first-call /
-   steady-state per-call.
-4. Batched evaluation (opt-in via ``DELAUNAY_VMAP=1``): ``jax.jit(jax.vmap(...))``.
-   Skipped by default because Delaunay vmap compilation can take 20+ minutes
-   on CPU due to triangulation + interpolation graph size.
-5. Correctness: eager vs JIT log-evidence agreement at ``rtol=1e-4`` for both
-   the per-step recomputation and the full pipeline.
-6. Static memory analysis of the batched program (only when vmap runs).
-7. Results JSON + PNG written to ``results/`` with per-step entries that
-   slot into the same bar-chart shape as ``likelihood/imaging/delaunay.py``.
-
-JIT-blocker notes
------------------
-
-Per-step decomposition risks missing cross-step XLA fusion and hitting
-library-level JAX blockers. Caveats from the previous opt-out version that
-still apply:
-
-- ``dataset.transformer.transform_mapping_matrix`` is JIT-friendly for
-  ``TransformerDFT`` (a single matrix multiply) and the default SMA preset
-  uses it. The JAX-native ``al.TransformerNUFFT`` (nufftax-backed) IS
-  JIT-friendly today, but it is currently incompatible with
-  ``apply_sparse_operator`` (see
-  ``PyAutoArray/autoarray/dataset/interferometer/dataset.py:261``) — the
-  Delaunay path here relies on the sparse precision operator, so the
-  transformer stays on DFT. The legacy ``TransformerNUFFTPyNUFFT`` is
-  pynufft-based and is not JIT-friendly.
-- The visibility-space χ² in step 13 separates the complex visibilities and
-  noise into real/imag components inside the JIT body (matching the
-  ``pixelization/likelihood_function.py`` reference). Complex-valued JIT
-  with autoarray ``Visibilities`` wrappers is avoided.
-
-Pytree-native parameter inputs
-------------------------------
-
-Uses ``af.ModelInstance`` as the JIT input via PyAutoFit's opt-in pytree
-registration (``autofit.jax.register_model``). Exercises the ``TuplePrior``
-pytree support landed in PyAutoFit#1222.
+ 12. Reconstruction s = NNLS(F + H, D).
+ 13. Mapped reconstructed visibilities + log evidence (visibility-space chi²).
+
+Per-step timing is approximate: XLA may fuse operations differently when
+compiled as one program vs separate pieces. All JAX timings use
+``block_until_ready()`` to force synchronous measurement.
+
+Output
+------
+
+Results JSON and PNG are written to ``results/breakdown/interferometer/`` using
+the basename ``delaunay_breakdown_{instrument}_v{al_version}``.
 """
 
 import os
@@ -98,8 +54,8 @@
 import autolens as al
 from autofit.jax import register_model as _register_model_pytrees
 
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from adapt_image_util import adapt_image_for_dataset  # noqa: E402
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # Instrument configuration
@@ -117,8 +73,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
     device_info_dict,
@@ -244,12 +198,6 @@ def jit_profile(func, label, *args, n_repeats=10):
 # ---------------------------------------------------------------------------
 # 2. Adapt image + image mesh (Hilbert)
 # ---------------------------------------------------------------------------
-#
-# ``image_mesh.Hilbert`` adaptively places the source mesh vertices in the
-# image plane based on the lensed-source adapt image — denser where the
-# source lives, sparser elsewhere. Replaces the regular ``image_mesh.Overlay``
-# + circular-edge fallback that preceded this path. ``zeroed_pixels=0``
-# because Hilbert's placement is data-driven (no fixed edge points to mask).
 
 print("\n--- Adapt image (lensed source) ---")
 
@@ -406,8 +354,6 @@ def jit_profile(func, label, *args, n_repeats=10):
 # ---------------------------------------------------------------------------
 # Step 1: Ray-trace data grid to source plane
 # ---------------------------------------------------------------------------
-# Same operation as ``pixelization/likelihood_function.py:__Ray Tracing__``
-# applied to the data grid (one of the two grids the inversion uses).
 
 print("\n--- Step 1: Ray-trace data grid ---")
 
@@ -471,9 +417,6 @@ def ray_trace_mesh_raw(mesh_raw):
 # Steps 3-4 from the imaging sibling (lens-light pre-PSF image and PSF-blurred
 # image) don't exist for the interferometer pixelization model — there's no
 # parametric lens light. We jump straight to step 5.
-#
-# Same as ``pixelization/likelihood_function.py:__Border Relocation__``;
-# scipy-based so eager-only.
 
 print("\n--- Step 5: Border relocation ---")
 
@@ -504,7 +447,6 @@ def ray_trace_mesh_raw(mesh_raw):
 # ---------------------------------------------------------------------------
 # Step 6: Delaunay triangulation + interpolation + mapper
 # ---------------------------------------------------------------------------
-# scipy-based; same as imaging-delaunay step 6.
 
 print("\n--- Step 6: Delaunay triangulation + Interpolation + Mapper ---")
 
@@ -528,10 +470,6 @@ def ray_trace_mesh_raw(mesh_raw):
 # ---------------------------------------------------------------------------
 # Steps 7-13: Extract matrices from FitInterferometer.inversion for consistency
 # ---------------------------------------------------------------------------
-# The FitInterferometer pipeline handles edge-pixel zeroing, curvature-diagonal
-# adjustments, and settings that are difficult to replicate manually. We extract
-# the production matrices upfront and then JIT-profile the linear-algebra ops
-# on the reference inputs, matching the imaging-sibling pattern.
 
 print("\n--- Extracting inversion matrices from FitInterferometer ---")
 
@@ -556,7 +494,6 @@ def ray_trace_mesh_raw(mesh_raw):
 # ---------------------------------------------------------------------------
 # Step 7: Mapping matrix
 # ---------------------------------------------------------------------------
-# Same as ``pixelization/likelihood_function.py:__Mapping Matrix__``.
 
 print("\n--- Step 7: Mapping matrix ---")
 
@@ -568,13 +505,6 @@ def ray_trace_mesh_raw(mesh_raw):
 # ---------------------------------------------------------------------------
 # Step 8: Transformed mapping matrix (NUFFT) — interferometer-specific
 # ---------------------------------------------------------------------------
-# Replaces the imaging sibling's "Blurred mapping matrix (PSF convolution)".
-#
-# For ``TransformerDFT`` this is a single complex matrix multiply
-# ``D @ M`` where D is the discrete Fourier matrix (n_vis × n_image) and
-# M is the real mapping matrix (n_image × source_pixels). JIT-friendly.
-# For ``TransformerNUFFT`` (pynufft-based, ALMA-scale) this step would fall
-# back to eager — flag if you swap the transformer.
 
 print("\n--- Step 8: Transformed mapping matrix (NUFFT) ---")
 
@@ -630,7 +560,6 @@ def transformed_mm_from_params(params_tree):
 # ---------------------------------------------------------------------------
 # Step 9: Data vector (D) — visibility-space
 # ---------------------------------------------------------------------------
-# Same as ``pixelization/likelihood_function.py:__Data Vector (D)__``.
 
 print("\n--- Step 9: Data vector (D) ---")
 
@@ -666,8 +595,6 @@ def compute_data_vector(
 # ---------------------------------------------------------------------------
 # Step 10: Curvature matrix (F)
 # ---------------------------------------------------------------------------
-# Same as ``pixelization/likelihood_function.py:__Curvature Matrix (F)__``:
-# F = sum_j f_ij f_kj / sigma_j^2, computed separately for real / imag and summed.
 
 print("\n--- Step 10: Curvature matrix (F) ---")
 
@@ -713,8 +640,6 @@ def compute_curvature_matrix(
 # ---------------------------------------------------------------------------
 # Step 11: Regularization matrix (H) — ConstantSplit scheme
 # ---------------------------------------------------------------------------
-# Same as ``pixelization/likelihood_function.py:__Regularization Matrix (H)__``;
-# extracted from the inversion for consistency with the production setup.
 
 print("\n--- Step 11: Regularization matrix (ConstantSplit) ---")
 
@@ -765,9 +690,6 @@ def compute_reconstruction(data_vector, curvature_matrix, regularization_matrix)
 # ---------------------------------------------------------------------------
 # Step 13: Mapped reconstructed visibilities + log evidence
 # ---------------------------------------------------------------------------
-# Same five-term log-evidence as the imaging sibling but with visibility-space
-# χ² (real + imag). Matches the formula in
-# ``pixelization/likelihood_function.py:__Likelihood Function — Five Terms__``.
 
 print("\n--- Step 13: Mapped reconstructed visibilities + log evidence ---")
 
@@ -786,7 +708,7 @@ def compute_log_evidence(
     mapped_real = jnp.matmul(transformed_mm_real, reconstruction)
     mapped_imag = jnp.matmul(transformed_mm_imag, reconstruction)
 
-    # χ² in visibility space (real + imag)
+    # chi² in visibility space (real + imag)
     chi_real = jnp.sum(((data_real - mapped_real) / noise_real) ** 2)
     chi_imag = jnp.sum(((data_imag - mapped_imag) / noise_imag) ** 2)
     chi_squared = chi_real + chi_imag
@@ -870,131 +792,7 @@ def compute_log_evidence(
 
 
 # ===================================================================
-# PART C — Full-pipeline JIT (for comparison)
-# ===================================================================
-
-print("\n" + "=" * 70)
-print("FULL-PIPELINE JIT")
-print("=" * 70)
-
-analysis = al.AnalysisInterferometer(
-    dataset=dataset,
-    adapt_images=adapt_images,
-    settings=al.Settings(use_mixed_precision=_cli.use_mixed_precision),
-    use_jax=True,
-)
-
-def full_pipeline_from_params(params_tree):
-    """Full interferometer likelihood from a pytree-shaped ``ModelInstance``.
-
-    No flat-vector unpacking inside the trace — the instance crosses the JIT
-    boundary directly, with constants (redshifts, etc.) kept static via the
-    ``aux_data`` partition set up by ``autofit.jax.register_model``.
-    """
-    return analysis.log_likelihood_function(instance=params_tree)
-
-_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
-full_pipeline_per_call = timer.records[-1][1] / 10
-
-print(f"  full log_evidence = {full_result}")
-
-# Correctness: for inversion models (pixelization + regularization), the
-# analysis "log_likelihood_function" actually returns the log-evidence
-# (= figure_of_merit), which includes the regularization/determinant terms.
-# Match against figure_of_merit_ref, not log_likelihood_ref.
-np.testing.assert_allclose(
-    float(full_result),
-    float(figure_of_merit_ref),
-    rtol=1e-4,
-    err_msg="interferometer/delaunay: JIT log-evidence does not match eager figure_of_merit",
-)
-print("  Eager-vs-JIT correctness PASSED")
-
-# ===================================================================
-# PART D — vmap (opt-in) + correctness
-# ===================================================================
-#
-# Delaunay vmap compilation can take 20+ minutes on CPU due to the size of
-# the triangulation + interpolation XLA graph. Skipped by default — set
-# DELAUNAY_VMAP=1 to opt in.
-
-print("\n--- vmap batched evaluation ---")
-
-run_vmap = os.environ.get("DELAUNAY_VMAP", "0") == "1"
-
-batch_size = 3
-vmap_batch_time = None
-vmap_per_call = None
-vmap_speedup = None
-result_vmap = None
-vmapped_full = None
-parameters = None
-
-_n_leaves = len(jax.tree_util.tree_leaves(params_tree))
-if not run_vmap:
-    print("  SKIPPED: opt-in via DELAUNAY_VMAP=1 (compilation can take 20+ minutes).")
-elif _n_leaves == 0:
-    print(f"  SKIPPED: model has 0 free parameters (all fixed to truth); "
-          f"vmap requires at least one array leaf.")
-else:
-    parameters = jax.tree_util.tree_map(
-        lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
-        params_tree,
-    )
-
-    vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
-
-    with timer.section("vmap_first_call"):
-        result_vmap = vmapped_full(parameters)
-        block(result_vmap)
-
-    n_vmap_repeats = 10
-    with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
-        for _ in range(n_vmap_repeats):
-            result_vmap = vmapped_full(parameters)
-            block(result_vmap)
-
-    vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
-    vmap_per_call = vmap_batch_time / batch_size
-    vmap_speedup = full_pipeline_per_call / vmap_per_call
-
-    print(f"  batch results = {result_vmap}")
-    print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
-    print(f"  vmap per call:         {vmap_per_call:.6f} s")
-    print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
-    print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
-
-    np.testing.assert_allclose(
-        np.array(result_vmap),
-        float(full_result),
-        rtol=1e-4,
-        err_msg="interferometer/delaunay: JAX vmap likelihood mismatch",
-    )
-    print("  vmap-vs-single-JIT correctness PASSED")
-
-# ===================================================================
-# PART E — Static memory analysis (only if vmap ran)
-# ===================================================================
-
-print("\n--- Static memory analysis ---")
-
-if vmapped_full is None:
-    print("  SKIPPED: vmap path was not exercised this run.")
-    memory_analysis = None
-else:
-    lowered_batched = vmapped_full.lower(parameters)
-    compiled_batched = lowered_batched.compile()
-
-    memory_analysis = compiled_batched.memory_analysis()
-    print(f"  Output size:  {memory_analysis.output_size_in_bytes / 1024**2:.3f} MB")
-    print(f"  Temp size:    {memory_analysis.temp_size_in_bytes / 1024**2:.3f} MB")
-    print(
-        f"  Total:        "
-        f"{(memory_analysis.output_size_in_bytes + memory_analysis.temp_size_in_bytes) / 1024**2:.3f} MB"
-    )
-
-# ===================================================================
-# JAX Likelihood Function Summary + artefacts
+# Per-step breakdown summary + JSON + PNG
 # ===================================================================
 
 import json
@@ -1005,7 +803,7 @@ def full_pipeline_from_params(params_tree):
 al_version = al.__version__
 
 print("\n" + "=" * 70)
-print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print(f"PER-STEP BREAKDOWN SUMMARY — {instrument.upper()} — v{al_version}")
 print("=" * 70)
 print(f"  Instrument:              {instrument}")
 print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
@@ -1015,10 +813,6 @@ def full_pipeline_from_params(params_tree):
 print(f"  Delaunay vertices:       {n_mesh_vertices}")
 print(f"  Edge zeroed pixels:      {edge_pixels_total}")
 print("-" * 70)
-print(f"  Eager log_likelihood:    {log_likelihood_ref}")
-print(f"  Eager figure_of_merit:   {figure_of_merit_ref}  (log-evidence)")
-print(f"  JIT  log-evidence:       {float(full_result)}")
-print("-" * 70)
 
 max_label = max(len(label) for label, _ in likelihood_steps)
 step_total = 0.0
@@ -1028,30 +822,11 @@ def full_pipeline_from_params(params_tree):
 
 print("-" * 70)
 print(f"      {'TOTAL (step-by-step)':<{max_label}}  {step_total:>12.6f} s")
-print(f"      {'Full pipeline (single JIT)':<{max_label}}  {full_pipeline_per_call:>12.6f} s")
-if vmap_per_call is not None:
-    print(f"      {'vmap batch (per call)':<{max_label}}  {vmap_per_call:>12.6f} s")
-    print(f"      {'vmap speedup vs single JIT':<{max_label}}  {vmap_speedup:>11.1f}x")
-else:
-    print(f"      {'vmap':<{max_label}}  {'SKIPPED':>12}")
 print("=" * 70)
 
 # --- Save results dictionary ---
 
-if vmap_per_call is None:
-    if not run_vmap:
-        vmap_payload = "SKIPPED — opt-in via DELAUNAY_VMAP=1"
-    else:
-        vmap_payload = "SKIPPED — model has 0 free parameters (all fixed to truth)"
-else:
-    vmap_payload = {
-        "batch_size": batch_size,
-        "batch_time": vmap_batch_time,
-        "per_call": vmap_per_call,
-        "speedup_vs_single_jit": round(vmap_speedup, 1),
-    }
-
-likelihood_summary = {
+breakdown_summary = {
     "autolens_version": al_version,
     "device": device_info_dict(),
     "instrument": instrument,
@@ -1068,23 +843,16 @@ def full_pipeline_from_params(params_tree):
     },
     "log_likelihood_eager": float(log_likelihood_ref),
     "figure_of_merit_eager": float(figure_of_merit_ref),
-    "log_evidence_jit": float(full_result),
     "steps": {label: per_call for label, per_call in likelihood_steps},
     "total_step_by_step": step_total,
-    "full_pipeline_single_jit": full_pipeline_per_call,
-    "vmap": vmap_payload,
-    "memory_mb": None if memory_analysis is None else {
-        "output": memory_analysis.output_size_in_bytes / 1024**2,
-        "temp": memory_analysis.temp_size_in_bytes / 1024**2,
-    },
 }
 
 dict_path, chart_path = resolve_output_paths(
     _cli,
-    default_dir=_workspace_root / "results" / "likelihood" / "interferometer",
-    default_basename=f"delaunay_likelihood_summary_{instrument}_v{al_version}",
+    default_dir=_workspace_root / "results" / "breakdown" / "interferometer",
+    default_basename=f"delaunay_breakdown_{instrument}_v{al_version}",
 )
-dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+dict_path.write_text(json.dumps(breakdown_summary, indent=2))
 print(f"\n  Results dict saved to: {dict_path}")
 
 # --- Save bar chart ---
@@ -1105,28 +873,12 @@ def full_pipeline_from_params(params_tree):
         fontsize=9,
     )
 
-ax.axvline(
-    full_pipeline_per_call,
-    color="#C44E52",
-    linestyle="--",
-    linewidth=1.5,
-    label=f"Full pipeline (single JIT): {full_pipeline_per_call:.6f} s",
-)
-if vmap_per_call is not None:
-    ax.axvline(
-        vmap_per_call,
-        color="#55A868",
-        linestyle="--",
-        linewidth=1.5,
-        label=f"vmap batch per call: {vmap_per_call:.6f} s ({vmap_speedup:.1f}x faster)",
-    )
-
 ax.set_yticks(y_pos)
 ax.set_yticklabels(labels, fontsize=10)
 ax.invert_yaxis()
 ax.set_xlabel("Time per call (s)", fontsize=11)
 fig.suptitle(
-    f"Delaunay Interferometer Likelihood — {instrument.upper()}",
+    f"Delaunay Interferometer Likelihood — Per-Step Breakdown — {instrument.upper()}",
     fontsize=12,
     fontweight="bold",
 )
@@ -1137,7 +889,6 @@ def full_pipeline_from_params(params_tree):
     f"total: {step_total:.6f} s",
     fontsize=9,
 )
-ax.legend(loc="lower right", fontsize=9)
 ax.margins(x=0.15)
 fig.tight_layout()
 
@@ -1147,15 +898,9 @@ def full_pipeline_from_params(params_tree):
 
 
 # ===================================================================
-# Regression assertion — realistic-scale deterministic log-evidence
+# Regression assertion — eager log_evidence only
 # ===================================================================
-#
-# Simulator truth parameters via GaussianPrior(mean=truth, sigma=small)
-# make the full-pipeline log-evidence deterministic at the prior median.
-# Pinned empirically per instrument; ``None`` means "skip the assertion and
-# print the value so it can be pasted in here on a clean run". sma was
-# bumped to mask_radius=3.5 in 2026-05-21's INSTRUMENTS refactor — the
-# old mask_radius=3.0 value no longer applies and needs re-measuring.
+
 EXPECTED_LOG_EVIDENCE = {
     "sma": None,
     "alma": None,
@@ -1184,18 +929,3 @@ def full_pipeline_from_params(params_tree):
         f"  Eager regression assertion PASSED: log_evidence matches "
         f"{expected_log_evidence:.6f}"
     )
-    np.testing.assert_allclose(
-        float(full_result),
-        expected_log_evidence,
-        rtol=1e-3,
-        err_msg=f"interferometer/delaunay[{instrument}]: regression — full log_evidence drifted",
-    )
-    print(f"  Full-pipeline regression assertion PASSED")
-    if result_vmap is not None:
-        np.testing.assert_allclose(
-            np.array(result_vmap),
-            expected_log_evidence,
-            rtol=1e-3,
-            err_msg=f"interferometer/delaunay[{instrument}]: regression — vmap log_evidence drifted",
-        )
-        print(f"  vmap regression assertion PASSED")
diff --git a/likelihood/OPTIMIZATION_NOTES.md b/likelihood_runtime/OPTIMIZATION_NOTES.md
similarity index 100%
rename from likelihood/OPTIMIZATION_NOTES.md
rename to likelihood_runtime/OPTIMIZATION_NOTES.md
diff --git a/likelihood_runtime/README.md b/likelihood_runtime/README.md
new file mode 100644
index 0000000..fc2456f
--- /dev/null
+++ b/likelihood_runtime/README.md
@@ -0,0 +1,148 @@
+# likelihood_runtime
+
+End-to-end full-pipeline timing of the PyAutoLens likelihood function across hardware tiers and precisions. The headline question is:
+
+> *"How long will this likelihood take per call on this hardware?"*
+
+Run scripts in this package — or, more commonly, the [`sweep.py`](sweep.py) driver — when you need to predict production sampler cost, compare CPU/GPU/A100 throughput on the same dataset, or measure the impact of mixed precision on a specific cell. The output of a multi-config sweep is a single `comparison.json` + `comparison.png` per cell with a row per hardware/precision config and the production cost on that row.
+
+For *where the time goes inside the likelihood*, use the sibling package [`likelihood_breakdown/`](../likelihood_breakdown/). The two packages are deliberately disjoint so neither has to pay the other's cost.
+
+The empirical findings from previous sweeps — per-cell timings, mp verdicts, the GPU-NUFFT regression, the upstream blockers — live in [`OPTIMIZATION_NOTES.md`](OPTIMIZATION_NOTES.md) in this directory.
+
+## Methodology
+
+Each script measures **one** headline quantity per run: the steady-state cost of the entire likelihood as a single JIT-compiled JAX program. There is no per-step decomposition. Each run proceeds through the following stages:
+
+1. **Eager numpy baseline** — `FitImaging` / `FitInterferometer` with `xp=np`, used as the correctness reference.
+2. **Full-pipeline JIT** — `jax.jit(analysis.log_likelihood_function)(instance)` on a pytree-registered `ModelInstance`. Records lower / compile / first-call / steady-state × 10. The `steady × 10 / 10` average is the headline `full_pipeline_per_call` number.
+3. **Vmap batched evaluation** — `jax.jit(jax.vmap(full_pipeline_from_params))(batched_params)` with `batch_size=3`. Records the same four phases. The reported `vmap_per_call` is `batch_time / batch_size`; the speedup versus single JIT is `full_pipeline_per_call / vmap_per_call`. Some cells skip vmap by design (datacube — wrong batching axis; delaunay — opt-in via `DELAUNAY_VMAP=1` because compilation can take 20+ minutes).
+4. **Correctness assertions** — eager ≡ JIT and JIT ≡ vmap log-likelihoods agree at `rtol=1e-4`. Mixed-precision (mp) paths shift the inversion compute dtype to fp32, so the rtol for the JIT/vmap checks is loosened to `1e-3`.
+5. **Static memory analysis** — XLA's compiled-program memory footprint (output + temp) is recorded in `memory_mb`.
+
+## The 6-config matrix
+
+The sweep harness drives every in-scope cell through this matrix:
+
+| Config | Backend | Precision | Env / Flag |
+|--------|---------|-----------|------------|
+| `local_cpu_fp64` | CPU | fp64 | `JAX_PLATFORM_NAME=cpu JAX_PLATFORMS=cpu` |
+| `local_cpu_mp` | CPU | mixed (fp32 inversion) | same + `--use-mixed-precision` |
+| `local_gpu_fp64` | RTX 2060 (consumer) | fp64 | `JAX_PLATFORM_NAME=cuda JAX_PLATFORMS=cuda,cpu` |
+| `local_gpu_mp` | RTX 2060 | mixed | same + `--use-mixed-precision` |
+| `hpc_a100_fp64` | A100 (80 GB) | fp64 | SLURM-dispatched via `z_projects/profiling/hpc/sync` |
+| `hpc_a100_mp` | A100 | mixed | same + `--use-mixed-precision` |
+
+The `cuda,cpu` listing on GPU configs is load-bearing: the Delaunay + datacube paths use `jax.pure_callback` for Hilbert-curve mesh generation, which needs a CPU device available even when the primary platform is CUDA. Without the trailing `cpu` the callback raises `pure_callback failed to find a local CPU device`.
+
+## What mixed precision actually means
+
+`--use-mixed-precision` threads `SettingsInversion(use_mixed_precision=True)` through `FitImaging` / `FitInterferometer` / `AnalysisInterferometer` and into the inversion's compute path. Under JAX (`xp=jnp`) this drops the inversion's working dtype to fp32; under numpy, mp is a no-op. Paths that honour the flag:
+
+- The PSF FFT convolution in `Convolver.convolved_image_from` (linear MGE bulge path).
+- The cube allocation in `mapper_util.mapping_matrix_from`.
+- The noise-weighted curvature accumulation in `inversion_util.curvature_matrix_via_mapping_matrix_from`.
+
+Paths that intentionally stay fp64:
+
+- The NNLS reconstruction (active-set / Cholesky / `cho_solve`) — sensitive to fp32 noise on ill-conditioned source meshes.
+- The log-determinant of the curvature regularisation matrix used by `figure_of_merit` — condition numbers can exceed 1e6 on fine pixelisations.
+- Light profile evaluation on the over-sampled grid; only the resulting mapping matrix is downcast.
+
+Empirical effect: mp on CPU consistently helps 7 – 23 %. On GPU it's more variable — significant on FFT-heavy MGE (27 % on the local NUFFT mge run), modest-to-zero on pixelisation/delaunay, slightly *negative* on some Delaunay configs where the fp32/fp64 cast at the sparse-operator boundary costs more than it saves. Per-cell verdicts live in `OPTIMIZATION_NOTES.md`.
+
+## Scripts
+
+| Script | Dataset class | Source model |
+|--------|--------------|--------------|
+| `imaging/mge.py` | Imaging | MGE linear bulge |
+| `imaging/pixelization.py` | Imaging | RectangularAdaptImage |
+| `imaging/delaunay.py` | Imaging | DelaunayBrightnessImage |
+| `interferometer/mge.py` | Interferometer | MGE linear bulge; `TransformerNUFFT` (nufftax) default, `--use-dft` opt-in |
+| `interferometer/pixelization.py` | Interferometer | RectangularAdaptImage + `apply_sparse_operator(use_jax=True)` |
+| `interferometer/delaunay.py` | Interferometer | DelaunayBrightnessImage + `apply_sparse_operator(use_jax=True)` |
+| `datacube/delaunay.py` | Datacube (34-channel cube) | DelaunayBrightnessImage per channel, shared lens model |
+| `point_source/image_plane.py` | Point source | Image-plane χ² via `PointSolver` |
+| `point_source/source_plane.py` | Point source | Source-plane χ² (cheaper proxy) |
+
+## Driving the matrix — `sweep.py` and `aggregate.py`
+
+The harness drives each cell through every config as a subprocess; the aggregator consolidates the per-config JSONs.
+
+```bash
+# Run the full local matrix (CPU + GPU × fp64 + mp) on every in-scope cell
+python likelihood_runtime/sweep.py
+
+# Restrict to certain cells
+python likelihood_runtime/sweep.py --only interferometer/mge interferometer/delaunay
+
+# Skip a backend
+python likelihood_runtime/sweep.py --skip-gpu       # CPU only
+python likelihood_runtime/sweep.py --skip-cpu       # GPU only
+
+# Skip the mixed-precision rows
+python likelihood_runtime/sweep.py --skip-mp
+
+# Dry-run to inspect the planned subprocess commands
+python likelihood_runtime/sweep.py --dry-run
+```
+
+The sweep writes per-config artifacts at:
+
+```
+<output-root>/<class>/<model>/local_cpu_fp64.json
+<output-root>/<class>/<model>/local_cpu_fp64.png
+<output-root>/<class>/<model>/local_cpu_fp64.log   (captured stdout/stderr)
+```
+
+Default `<output-root>` is `autolens_workspace_developer/jax_profiling/results/jit/` — the canonical multi-config result store across PRs.
+
+Then aggregate:
+
+```bash
+# Aggregate every cell that has at least one local_*.json
+python likelihood_runtime/aggregate.py
+
+# One cell only
+python likelihood_runtime/aggregate.py --cell interferometer/mge
+```
+
+The aggregator writes `comparison.json` + `comparison.png` into the same per-cell directory. The PNG is a log-scale grouped bar chart with one bar per (step, config); the JSON is one entry per config containing the full payload from each per-config JSON.
+
+## How to read the output
+
+For each cell, the headline reading order is:
+
+1. **`comparison.png`** — log-scale view of full_pipeline_per_call per config. The slope from CPU → consumer GPU → A100 tells you what hardware tier this cell needs.
+2. **`comparison.json` → `configs.<name>.full_pipeline_per_call`** — the production cost a sampler will pay per likelihood evaluation on that hardware.
+3. **`vmap.per_call` + `vmap.speedup_vs_single_jit`** — the cheapest throughput lever. For cells where vmap helps (typically MGE / non-iterative inversions), batching is the right knob; for cells where the vmap speedup is ≤ 1× (sparse pixelisation, datacube), reach for data-parallel processes instead.
+4. **`memory_mb.temp`** — XLA's compiled-program working memory. Compare against your hardware budget (RTX 2060 is 6 GB; A100 is 80 GB) before increasing vmap batch size.
+5. **mp verdict** — `(fp64 - mp) / fp64`. A solid 10 %+ win, with the log-likelihood unchanged at `rtol=1e-3`, means "default to mp on this hardware tier".
+
+For headline cross-cell insights and the running list of where each cell sits in terms of "where to optimize next", read [`OPTIMIZATION_NOTES.md`](OPTIMIZATION_NOTES.md).
+
+## Auto-simulation
+
+If `dataset/<class>/<instrument>/` is missing, the script shells out to `simulators/<dataset_type>.py --instrument <name>` and waits for the dataset to land before continuing. Datasets are seeded — re-running the simulator produces bit-identical files. Each simulator's `INSTRUMENTS` dict is the single source of truth, and the runtime scripts import it directly:
+
+```python
+from simulators.interferometer import INSTRUMENTS
+```
+
+Currently configured presets:
+
+| Class | Presets |
+|-------|---------|
+| `imaging` | euclid, hst, jwst, ao |
+| `interferometer` | sma (190 vis), alma (1 M vis), alma_high (10 M vis) |
+| `point_source` | simple |
+
+## When to choose runtime vs breakdown
+
+| Question | Package |
+|----------|---------|
+| "How long will my A100 sampler run take per likelihood call?" | **runtime** |
+| "Does mixed precision actually save time on this cell?" | **runtime** |
+| "How does production cost change between consumer GPU and A100?" | **runtime** |
+| "Where should I focus PyAutoLens optimisation work for this cell?" | breakdown |
+| "Which step fuses cleanly under XLA and which doesn't?" | breakdown (compare against runtime's `full_pipeline_per_call`) |
diff --git a/scripts/aggregate_sweep.py b/likelihood_runtime/aggregate.py
similarity index 95%
rename from scripts/aggregate_sweep.py
rename to likelihood_runtime/aggregate.py
index 730c764..6d102e2 100644
--- a/scripts/aggregate_sweep.py
+++ b/likelihood_runtime/aggregate.py
@@ -1,24 +1,24 @@
 """Aggregate per-config JSONs for a swept likelihood cell into comparison.{json,png}.
 
 Reads every ``<config_name>.json`` under a cell's output dir (see
-``sweep_likelihood.py``) and produces a single ``comparison.json`` whose
-schema mirrors the existing
+``sweep.py``) and produces a single ``comparison.json`` whose schema
+mirrors the existing
 ``autolens_workspace_developer/jax_profiling/results/jit/imaging/{mge,
-pixelization,delaunay}/comparison.json`` artifacts so the existing readers
-(and the OPTIMIZATION_NOTES doc) continue to work.
+pixelization,delaunay}/comparison.json`` artifacts so the existing
+readers (and the OPTIMIZATION_NOTES doc) continue to work.
 
 The ``comparison.png`` is a log-scale grouped bar chart: one bar per
-(step, config), sorted by step cost on the slowest config. The full-pipeline
-single-JIT row and the vmap per-call row are appended at the bottom so the
-production-cost numbers stand out.
+(step, config), sorted by step cost on the slowest config. The
+full-pipeline single-JIT row and the vmap per-call row are appended at
+the bottom so the production-cost numbers stand out.
 
 Usage::
 
     # All cells under the default sweep output root
-    python scripts/aggregate_sweep.py
+    python likelihood_runtime/aggregate.py
 
     # One cell only
-    python scripts/aggregate_sweep.py --cell interferometer/mge
+    python likelihood_runtime/aggregate.py --cell interferometer/mge
 """
 
 from __future__ import annotations
diff --git a/likelihood_runtime/datacube/__init__.py b/likelihood_runtime/datacube/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/likelihood_runtime/datacube/delaunay.py b/likelihood_runtime/datacube/delaunay.py
new file mode 100644
index 0000000..c0f9fef
--- /dev/null
+++ b/likelihood_runtime/datacube/delaunay.py
@@ -0,0 +1,601 @@
+"""
+JAX Profiling: Delaunay Datacube Likelihood (Full-Pipeline Runtime)
+===================================================================
+
+Times the full-pipeline JAX likelihood function for an ALMA-style datacube —
+a list of N ``Interferometer`` channels sharing a single lens model — where
+each channel reconstructs its own source with a Delaunay pixelization +
+ConstantSplit regularization.
+
+Mirrors the structure of the interferometer sibling
+(``likelihood_runtime/interferometer/delaunay.py``; Phase 2 of the datacube
+roadmap, just merged). The key new ingredient is the **channel-invariant vs
+channel-variant** split: most steps are computed once for the whole cube
+(shared lens, shared mesh, shared mask), only the NUFFT-based inversion-setup
+chain, the data vector, the curvature matrix, the reconstruction, and the
+log-evidence depend on per-channel data.
+
+The cube total is::
+
+    cube_cost = sum(channel_invariant_costs) + N_channels * sum(channel_variant_costs)
+
+That number quantifies how much the deferred shared-``Lᵀ W̃ L`` optimisation
+will save: moving the curvature matrix from per-channel to shared would
+subtract ``(N - 1) * curvature_matrix_cost`` from the cube total.
+
+Channel-invariant vs channel-variant taxonomy
+---------------------------------------------
+
+For the canonical datacube case where the lens model is shared across all
+channels:
+
+============================================  ================  =========================
+Step                                          Channel-invariant Computed
+============================================  ================  =========================
+1. Ray-trace data grid                        yes               once for the cube
+2. Ray-trace mesh grid                        yes               once for the cube
+3. Inversion setup (border + mapper + NUFFT)  **NUFFT depends   once per channel
+                                              on uv_wavelengths**
+4. Data vector D                              per channel       once per channel
+5. Curvature matrix F                         per channel       once per channel
+6. Regularization matrix H                    yes               once for the cube
+7. Reconstruction (NNLS)                      per channel       once per channel
+8. Mapped recon + log evidence                per channel       once per channel
+============================================  ================  =========================
+
+Dataset
+-------
+
+This profiler reuses the SMA interferometer dataset
+(``dataset/interferometer/sma/``) loaded ``n_channels`` times (34 by default)
+as a "cube". Each channel has identical visibilities, noise map and
+uv_wavelengths — the point here is timing, not science. The N-channel cube
+log-evidence is ``N × single-channel log-evidence`` exactly, which makes the
+regression assertion trivial.
+
+If you want a realistic per-channel-distinct cube, point the loader at the
+workspace simulator output at
+``../autolens_workspace/dataset/interferometer/datacube/sim_simple/``; the
+JIT-cost taxonomy doesn't change because it's a function of which arrays are
+loop-variables in ``FitInterferometer``, not the data values themselves.
+
+Measures
+--------
+
+1. Eager baseline: ``FitInterferometer`` per channel with ``xp=np``; cube
+   reference log-evidence is the sum.
+2. Per-step JIT profiling is NOT run by this runtime variant: the per-stage
+   ``jit_profile()`` breakdown (channel-invariant stages timed once,
+   channel-variant stages timed on channel 0 and reported as
+   ``N × per-call``) lives in ``likelihood_breakdown/datacube/delaunay.py``.
+3. Full-pipeline cube JIT (opt-in via ``CUBE_FULL_JIT=1``): ``jax.jit`` over
+   the explicit ``sum(analysis.log_likelihood_function(instance) for analysis
+   in analysis_list)`` — the same shape as the user-facing
+   ``datacube/likelihood_function.py`` and the cube modeling scripts'
+   internal ``FactorGraphModel`` sum. Timed over 3 steady-state repeats.
+4. Correctness: when the full-pipeline JIT runs, its log-evidence must match
+   the summed eager ``FitInterferometer.log_evidence`` at ``rtol=1e-4``.
+   No per-step recomputation happens in this file.
+5. Results JSON written to the path chosen by ``resolve_output_paths``
+   (default directory ``results/likelihood/datacube/``). No bar chart is
+   drawn here; the per-step chart was removed from the runtime variant.
+
+vmap is **skipped** for the cube profiler. The natural batching dimension is
+"datasets" (one entry per channel) not "parameters" (which the
+interferometer-sibling vmap exercises). A vmap-over-channels variant would
+require a different graph shape and isn't the bottleneck we care about for
+the shared-``Lᵀ W̃ L`` optimisation.
+"""
+
+import numpy as np
+import jax
+import jax.numpy as jnp
+import os
+import time
+import subprocess
+import sys
+from pathlib import Path
+from contextlib import contextmanager
+
+import autofit as af
+import autolens as al
+import autoarray as aa
+from autofit.jax import register_model as _register_model_pytrees
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
+
+# ---------------------------------------------------------------------------
+# Instrument configuration
+# ---------------------------------------------------------------------------
+
+
+# AUTOLENS_PROFILING_SMOKE=1 short-circuit (Phase 5 / CI lint smoke).
+# Verifies the import graph + module-level setup succeeded without running
+# the full profiling pipeline. Skipped entirely when the env var is unset.
+import os as _smoke_os
+import sys as _smoke_sys
+if _smoke_os.environ.get("AUTOLENS_PROFILING_SMOKE") == "1":
+    print(f"[smoke] {__file__}: imports + module setup OK; exiting.")
+    _smoke_sys.exit(0)
+
+# Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
+# Tolerates extra/unknown args via parse_known_args inside the helper.
+from _profile_cli import (  # noqa: E402
+    parse_profile_cli,
+    device_info_dict,
+    resolve_output_paths,
+    auto_simulate_if_missing,
+)
+from simulators.interferometer import INSTRUMENTS  # noqa: E402
+_cli = parse_profile_cli()
+
+instrument = "sma"  # <-- change to profile a different instrument; cube is N copies of the per-instrument dataset
+
+# n_channels = 34 matches the prior Hannah ALMA cube fiducial. For quick
+# iteration on the smaller sma dataset, drop this to 4.
+n_channels = 34
+hilbert_pixels = 500  # 500-tier production fiducial per channel (× n_channels)
+regularization_coefficient = 1.0
+
+
+# ---------------------------------------------------------------------------
+# Profiling helpers
+# ---------------------------------------------------------------------------
+
+class Timer:
+    """Accumulates named (label, elapsed-seconds) records, printing each one as it is taken."""
+
+    def __init__(self):
+        self.records: list[tuple[str, float]] = []  # (label, seconds), in measurement order
+
+    @contextmanager
+    def section(self, label: str):  # use as `with timer.section("name"):` around the code to time
+        start = time.perf_counter()
+        yield
+        elapsed = time.perf_counter() - start  # skipped if the with-body raises (no try/finally)
+        self.records.append((label, elapsed))
+        print(f"  [{label}] {elapsed:.4f} s")
+
+
+def block(x):  # call block_until_ready() when available so timings are synchronous; returns x
+    if hasattr(x, "block_until_ready"):  # objects without the method pass through untouched
+        x.block_until_ready()
+    return x
+
+
+def jit_profile(func, label, *args, n_repeats=10):
+    """JIT-compile *func*, time lower / compile / first call / steady state."""
+    jitted = jax.jit(func)
+
+    with timer.section(f"{label}_lower"):  # `timer` is the module-level Timer instance
+        lowered = jitted.lower(*args)
+
+    with timer.section(f"{label}_compile"):
+        compiled = lowered.compile()
+
+    with timer.section(f"{label}_first_call"):
+        result = compiled(*args)
+        block(result)  # force synchronous measurement of the call
+
+    with timer.section(f"{label}_steady_x{n_repeats}"):
+        for _ in range(n_repeats):
+            result = compiled(*args)
+            block(result)
+
+    per_call = timer.records[-1][1] / n_repeats  # last record is the steady-state section above
+    print(f"    -> per-call avg: {per_call:.6f} s")
+    return compiled, result  # (compiled callable, output of the final call)
+
+
+timer = Timer()
+
+# ===================================================================
+# PART A — Setup (not JIT-compiled)
+# ===================================================================
+
+# ---------------------------------------------------------------------------
+# 1. Dataset loading: reuse SMA interferometer dataset N times
+# ---------------------------------------------------------------------------
+
+print(f"\n--- Dataset loading [{instrument}, {n_channels} channels] ---")
+
+_script_dir = Path(__file__).resolve().parent
+_workspace_root = _script_dir.parents[1]
+pixel_scale = INSTRUMENTS[instrument]["pixel_scale"]
+real_space_shape = INSTRUMENTS[instrument]["real_space_shape"]
+dataset_path = Path("dataset") / "interferometer" / instrument
+
+auto_simulate_if_missing(
+    dataset_path,
+    dataset_type="interferometer",
+    instrument=instrument,
+    workspace_root=_workspace_root,
+)
+
+mask_radius = INSTRUMENTS[instrument]["mask_radius"]
+
+real_space_mask = al.Mask2D.circular(
+    shape_native=real_space_shape,
+    pixel_scales=pixel_scale,
+    radius=mask_radius,
+)
+
+with timer.section("dataset_list_load"):
+    # apply_sparse_operator: precompute the visibility-space sparse precision
+    # operator so per-fit curvature assembly uses the FFT-based sparse path
+    # instead of a dense DFT for every source pixel. Unblocked by
+    # PyAutoArray#316 (the Pmax > 1 extent-indexing fix); on Delaunay this was
+    # previously guarded with NotImplementedError.
+    dataset_list = [
+        al.Interferometer.from_fits(
+            data_path=dataset_path / "data.fits",
+            noise_map_path=dataset_path / "noise_map.fits",
+            uv_wavelengths_path=dataset_path / "uv_wavelengths.fits",
+            real_space_mask=real_space_mask,
+            transformer_class=al.TransformerDFT,
+            # DFT is mandatory here: apply_sparse_operator is not yet
+            # compatible with the new nufftax-backed al.TransformerNUFFT (see
+            # PyAutoArray/autoarray/dataset/interferometer/dataset.py:261).
+            # Swapping the transformer would raise NotImplementedError.
+            raise_error_dft_visibilities_limit=False,
+        ).apply_sparse_operator(use_jax=True, show_progress=False)
+        for _ in range(n_channels)
+    ]
+
+n_visibilities = dataset_list[0].uv_wavelengths.shape[0]
+print(f"  Channels:           {n_channels}")
+print(f"  Visibilities/chan:  {n_visibilities}")
+
+# ---------------------------------------------------------------------------
+# 2. Adapt image + image mesh (Hilbert, channel-invariant)
+# ---------------------------------------------------------------------------
+#
+# Adapt image is computed once from the truth tracer and reused across every
+# channel — the lens model is channel-invariant, so the lensed-source image
+# in image plane is the same for each channel. ``image_mesh.Hilbert`` then
+# adaptively places source mesh vertices to follow the source intensity.
+
+print("\n--- Adapt image (lensed source) ---")
+
+with timer.section("adapt_image_build"):
+    adapt_image = adapt_image_for_dataset(
+        dataset_path=dataset_path, dataset=dataset_list[0]
+    )
+
+print(f"  adapt_image shape (slim): {adapt_image.shape_slim}")
+
+print("\n--- Image mesh construction (Hilbert) ---")
+
+with timer.section("image_mesh_hilbert"):
+    image_mesh = al.image_mesh.Hilbert(
+        pixels=hilbert_pixels, weight_power=1.0, weight_floor=0.0
+    )
+    image_plane_mesh_grid = image_mesh.image_plane_mesh_grid_from(
+        mask=dataset_list[0].real_space_mask, adapt_data=adapt_image
+    )
+
+n_mesh_vertices = image_plane_mesh_grid.shape[0]
+edge_pixels_total = 0
+print(f"  Hilbert pixels: {hilbert_pixels}")
+print(f"  Mesh vertices placed: {n_mesh_vertices}")
+
+# ---------------------------------------------------------------------------
+# 3. Model construction
+# ---------------------------------------------------------------------------
+
+print("\n--- Model construction ---")
+
+with timer.section("model_build"):
+    mass = af.Model(al.mp.Isothermal)
+    mass.centre.centre_0 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.centre.centre_1 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.einstein_radius = af.GaussianPrior(mean=1.6, sigma=0.05)
+    _lens_mass_ell = al.convert.ell_comps_from(axis_ratio=0.9, angle=45.0)
+    mass.ell_comps.ell_comps_0 = af.GaussianPrior(mean=_lens_mass_ell[0], sigma=0.01)
+    mass.ell_comps.ell_comps_1 = af.GaussianPrior(mean=_lens_mass_ell[1], sigma=0.01)
+
+    shear = af.Model(al.mp.ExternalShear)
+    shear.gamma_1 = af.GaussianPrior(mean=0.05, sigma=0.005)
+    shear.gamma_2 = af.GaussianPrior(mean=0.05, sigma=0.005)
+
+    lens = af.Model(al.Galaxy, redshift=0.5, mass=mass, shear=shear)
+
+    mesh = al.mesh.Delaunay(
+        pixels=n_mesh_vertices,
+        zeroed_pixels=0,
+    )
+    regularization = al.reg.ConstantSplit(coefficient=regularization_coefficient)
+    pixelization = al.Pixelization(mesh=mesh, regularization=regularization)
+
+    source = af.Model(al.Galaxy, redshift=1.0, pixelization=pixelization)
+
+    model = af.Collection(galaxies=af.Collection(lens=lens, source=source))
+
+print(f"  Total free parameters: {model.total_free_parameters}")
+print(f"  Delaunay pixels: {n_mesh_vertices}")
+
+# ---------------------------------------------------------------------------
+# 4. Instantiate concrete objects from prior medians
+# ---------------------------------------------------------------------------
+
+print("\n--- Instantiate concrete model ---")
+
+with timer.section("instance_from_vector"):
+    param_vector = model.physical_values_from_prior_medians
+    instance = model.instance_from_vector(vector=param_vector)
+
+with timer.section("register_pytrees"):
+    _register_model_pytrees(model)
+
+params_tree = jax.tree_util.tree_map(jnp.asarray, instance)
+
+tracer = al.Tracer(galaxies=list(instance.galaxies))
+
+# The adapt_images object is channel-invariant — the image-plane Delaunay mesh
+# vertices are shared across channels (the lens model is shared).
+adapt_images = al.AdaptImages(
+    galaxy_image_plane_mesh_grid_dict={
+        instance.galaxies.source: image_plane_mesh_grid,
+    },
+    galaxy_name_image_plane_mesh_grid_dict={
+        "('galaxies', 'source')": image_plane_mesh_grid,
+    },
+)
+
+print(f"  Tracer planes: {tracer.total_planes}")
+
+# ---------------------------------------------------------------------------
+# 5. Configuration summary
+# ---------------------------------------------------------------------------
+
+print("\n--- Configuration (determines run time) ---")
+print(f"  Instrument:              {instrument}")
+print(f"  Channels:                {n_channels}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Real-space mask radius:  {mask_radius} arcsec")
+print(f"  Real-space grid shape:   {real_space_shape[0]} x {real_space_shape[1]}")
+print(f"  Visibilities/chan:       {n_visibilities}")
+print(f"  Hilbert pixels:          {hilbert_pixels}")
+print(f"  Delaunay vertices:       {n_mesh_vertices}")
+print(f"  Edge zeroed pixels:      {edge_pixels_total}")
+print(f"  Reg. coefficient:        {regularization_coefficient}")
+
+# ---------------------------------------------------------------------------
+# 6. Per-channel eager FitInterferometer baseline
+# ---------------------------------------------------------------------------
+
+print(f"\n--- Per-channel eager FitInterferometer baselines ({n_channels} channels) ---")
+
+fit_list = []
+log_evidence_per_channel = []
+with timer.section(f"eager_fit_per_channel_x{n_channels}"):
+    for c, dataset in enumerate(dataset_list):
+        f = al.FitInterferometer(
+            dataset=dataset,
+            tracer=tracer,
+            adapt_images=adapt_images,
+            settings=al.Settings(use_mixed_precision=_cli.use_mixed_precision),
+            xp=np,
+        )
+        fit_list.append(f)
+        log_evidence_per_channel.append(f.log_evidence)
+
+for c, le in enumerate(log_evidence_per_channel):
+    print(f"  channel {c}: log_evidence = {le:.6f}")
+
+cube_log_evidence_ref = float(sum(log_evidence_per_channel))
+print(f"  cube reference log_evidence (sum) = {cube_log_evidence_ref:.6f}")
+
+
+# ===================================================================
+# PART C — Full-pipeline cube JIT (sum of per-channel log_likelihoods)
+# ===================================================================
+
+print("\n" + "=" * 70)
+print("FULL-PIPELINE CUBE JIT (for comparison)")
+print("=" * 70)
+
+# Part C is expensive at large n_channels: lower + compile build a graph
+# proportional to n_channels (e.g. ~70s for n_channels=34 on a laptop CPU),
+# and the steady-state first-call follows. Default to skipping; opt in with
+# CUBE_FULL_JIT=1 when the full-pipeline timing matters (e.g. comparing
+# step-by-step total against single-JIT).
+_run_full_cube_jit = os.environ.get("CUBE_FULL_JIT") == "1"
+
+if _run_full_cube_jit:
+    analysis_list = [
+        al.AnalysisInterferometer(
+            dataset=d,
+            adapt_images=adapt_images,
+            settings=al.Settings(use_mixed_precision=_cli.use_mixed_precision),
+            use_jax=True,
+        )
+        for d in dataset_list
+    ]
+
+    def full_cube_pipeline_from_params(params_tree):
+        """Cube log-evidence via the explicit per-channel sum.
+
+        Same shape as the user-facing ``datacube/likelihood_function.py``:
+        feeds the shared instance to every per-channel
+        ``AnalysisInterferometer.log_likelihood_function`` and sums.
+        """
+        total = jnp.zeros(())
+        for analysis in analysis_list:
+            total = total + analysis.log_likelihood_function(instance=params_tree)
+        return total
+
+    _full_cube_n_repeats = 3
+    _, full_cube_result = jit_profile(
+        full_cube_pipeline_from_params,
+        "full_cube_pipeline",
+        params_tree,
+        n_repeats=_full_cube_n_repeats,
+    )
+    full_pipeline_per_call = timer.records[-1][1] / _full_cube_n_repeats
+
+    print(f"  full cube log_evidence (JIT) = {full_cube_result}")
+
+    np.testing.assert_allclose(
+        float(full_cube_result),
+        cube_log_evidence_ref,
+        rtol=1e-4,
+        err_msg="Full-pipeline cube JIT log_evidence does not match summed eager FitInterferometer.log_evidence",
+    )
+    print("  Eager-vs-JIT cube correctness PASSED")
+else:
+    full_cube_result = None
+    full_pipeline_per_call = float("nan")
+    print(
+        "  Full-pipeline cube JIT SKIPPED — opt-in via CUBE_FULL_JIT=1. "
+        f"At n_channels={n_channels} the lower + compile alone is on the order of "
+        f"{n_channels * 2}-{n_channels * 3}s, so it's gated to keep the default "
+        "runtime usable; per-step JIT data for the shared-Lᵀ W̃ L analysis "
+        "comes from the likelihood_breakdown variant, not from this script."
+    )
+
+# ===================================================================
+# PART D — vmap (skipped for cube)
+# ===================================================================
+#
+# The natural batching axis for a cube fit is "datasets" (one entry per
+# channel), not "parameters" (which the interferometer-sibling vmap exercises).
+# vmap-over-channels would require a different graph shape and isn't where the
+# shared-Lᵀ W̃ L optimisation lives. Skipped.
+
+print("\n--- vmap (skipped) ---")
+print(
+    "  Cube batching dimension is 'datasets', not 'parameters'. The "
+    "interferometer-sibling vmap pattern doesn't map cleanly here. Skipped."
+)
+
+# ===================================================================
+# Summary + JSON + bar chart
+# ===================================================================
+
+import json
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+al_version = al.__version__
+
+print("\n" + "=" * 70)
+print(f"JAX LIKELIHOOD FUNCTION SUMMARY — CUBE {instrument.upper()} × {n_channels} — v{al_version}")
+print("=" * 70)
+print(f"  Instrument:              {instrument}")
+print(f"  Channels:                {n_channels}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Real-space mask radius:  {mask_radius} arcsec")
+print(f"  Real-space grid shape:   {real_space_shape[0]} x {real_space_shape[1]}")
+print(f"  Visibilities/chan:       {n_visibilities}")
+print(f"  Delaunay vertices:       {n_mesh_vertices}")
+print(f"  Edge zeroed pixels:      {edge_pixels_total}")
+print("-" * 70)
+print(f"  Cube reference log_evidence:  {cube_log_evidence_ref}")
+if full_cube_result is not None:
+    print(f"  Cube JIT log_evidence:        {float(full_cube_result)}")
+else:
+    print(f"  Cube JIT log_evidence:        SKIPPED (CUBE_FULL_JIT=1 to enable)")
+print("-" * 70)
+
+# Shared-Lᵀ W̃ L optimisation savings estimate: needs the per-channel
+# curvature-matrix cost, which this runtime variant never measures (the name
+# `curvature_matrix_per_channel` is undefined here), so record None instead.
+shared_lwl_savings = None
+
+print("-" * 70)
+if np.isfinite(full_pipeline_per_call):
+    print(f"      {'Full pipeline cube (single JIT)':<50}  {full_pipeline_per_call:>12.6f} s")
+else:
+    print(f"      {'Full pipeline cube (single JIT)':<50}  SKIPPED")
+print(f"      {'Shared-Lᵀ W̃ L savings (curvature only, est.)':<50}  N/A (per-step timing not run)")
+print("=" * 70)
+
+# --- Save results dictionary ---
+
+likelihood_summary = {
+    "autolens_version": al_version,
+    "device": device_info_dict(),
+    "instrument": instrument,
+    "model": "delaunay",
+    "n_channels": n_channels,
+    "configuration": {
+        "pixel_scale_arcsec": pixel_scale,
+        "mask_radius_arcsec": mask_radius,
+        "real_space_shape": list(real_space_shape),
+        "visibilities_per_channel": int(n_visibilities),
+        "hilbert_pixels": int(hilbert_pixels),
+        "delaunay_vertices": int(n_mesh_vertices),
+        "edge_zeroed_pixels": int(edge_pixels_total),
+        "regularization_coefficient": regularization_coefficient,
+    },
+    "cube_log_evidence_eager": cube_log_evidence_ref,
+    "cube_log_evidence_jit": (
+        float(full_cube_result) if full_cube_result is not None else None
+    ),
+    "log_evidence_per_channel_eager": [float(le) for le in log_evidence_per_channel],
+    "full_pipeline_cube_single_jit": full_pipeline_per_call,
+    "shared_lwl_savings_estimate": shared_lwl_savings,
+    "vmap": "SKIPPED — cube batching axis is 'datasets', not 'parameters'",
+}
+
+dict_path, chart_path = resolve_output_paths(
+    _cli,
+    default_dir=_workspace_root / "results" / "likelihood" / "datacube",
+    default_basename=f"delaunay_likelihood_summary_{instrument}_v{al_version}",
+)
+dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+print(f"\n  Results dict saved to: {dict_path}")
+print(f"  Bar chart path:        {chart_path} (no per-step chart in runtime variant)")
+
+
+# ===================================================================
+# Regression assertion — deterministic cube log-evidence
+# ===================================================================
+#
+# Identical channels = exact N × single-channel log-evidence.
+# No per-channel literal is pinned yet for any instrument, so the assertion
+# is skipped until a value below is filled in from a clean run.
+EXPECTED_LOG_EVIDENCE_PER_CHANNEL = {
+    "sma": None,
+    "alma": None,
+    "alma_high": None,
+}
+
+_per_channel = EXPECTED_LOG_EVIDENCE_PER_CHANNEL.get(instrument)
+expected_cube_log_evidence = (
+    n_channels * _per_channel if _per_channel is not None else None
+)
+
+if expected_cube_log_evidence is None:
+    print(
+        f"\n  Cube regression assertion SKIPPED for [{instrument}] — "
+        f"capture this run's eager cube log_evidence ({cube_log_evidence_ref}), "
+        f"divide by n_channels ({n_channels}) to get the per-channel value "
+        f"({cube_log_evidence_ref / n_channels}), and paste that into "
+        f"EXPECTED_LOG_EVIDENCE_PER_CHANNEL[{instrument!r}]."
+    )
+else:
+    np.testing.assert_allclose(
+        cube_log_evidence_ref,
+        expected_cube_log_evidence,
+        rtol=1e-4,
+        err_msg=(
+            f"datacube/delaunay[{instrument}]: regression — eager cube log_evidence "
+            f"drifted (got {cube_log_evidence_ref}, expected {expected_cube_log_evidence})"
+        ),
+    )
+    print(
+        f"\n  Eager cube regression assertion PASSED: log_evidence matches "
+        f"{expected_cube_log_evidence:.6f}"
+    )
+    if full_cube_result is not None:
+        np.testing.assert_allclose(
+            float(full_cube_result),
+            expected_cube_log_evidence,
+            rtol=1e-3,
+            err_msg=f"datacube/delaunay[{instrument}]: regression — full cube log_evidence drifted",
+        )
+        print(f"  Full-pipeline cube regression assertion PASSED")
diff --git a/likelihood_runtime/imaging/delaunay.py b/likelihood_runtime/imaging/delaunay.py
new file mode 100644
index 0000000..42f74ff
--- /dev/null
+++ b/likelihood_runtime/imaging/delaunay.py
@@ -0,0 +1,578 @@
+"""
+JAX Profiling: Delaunay Imaging Likelihood (Full-Pipeline Runtime)
+==================================================================
+
+Profiles the full-pipeline JIT run time of the JAX likelihood function for an
+imaging dataset where the source galaxy is reconstructed using a Delaunay
+triangulation mesh with ConstantSplit regularization.
+
+Key differences from the rectangular pixelization profiling script:
+
+- Mesh vertices are placed in the **image-plane** by a Hilbert image mesh, then
+  ray-traced to the source-plane (rectangular computes directly in source-plane).
+- No edge points are appended: the Delaunay mesh is built with ``zeroed_pixels=0``.
+- Uses **InterpolatorDelaunay** (barycentric interpolation within triangles)
+  instead of bilinear interpolation on a rectangular grid.
+- Uses **ConstantSplit** regularization (cross-derivative scheme) instead of
+  the simpler Constant neighbour-difference scheme.
+- Delaunay triangulation itself uses scipy on CPU and cannot be JIT-compiled.
+
+Pipeline steps:
+
+1. Ray-trace data grid to source plane
+2. Ray-trace mesh grid (image-plane vertices) to source plane
+3. Lens light images (pre-PSF, JIT) + PSF convolution (eager)
+4. Profile-subtracted image
+5. Border relocation (data grid + mesh grid)
+6. Delaunay triangulation + interpolation + mapper
+7. Mapping matrix
+8. Blurred mapping matrix (PSF convolution)
+9. Data vector (D)
+10. Curvature matrix (F)
+11. Regularization matrix (H) — ConstantSplit scheme
+12. Regularized reconstruction: s = (F + H)^{-1} D
+13. Map reconstruction to image + log evidence
+
+Caveat: XLA may fuse operations differently when compiled as one program vs
+separate pieces, so the full-pipeline time reported here need not equal the
+sum of the per-step timings from ``likelihood_breakdown/imaging/delaunay.py``.
+
+All JAX timings use `block_until_ready()` to force synchronous measurement.
+"""
+
+import numpy as np
+import jax
+import jax.numpy as jnp
+import time
+import subprocess
+import sys
+from pathlib import Path
+from contextlib import contextmanager
+
+import autofit as af
+import autolens as al
+import autoarray as aa
+from autofit.jax import register_model as _register_model_pytrees
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
+
+# ---------------------------------------------------------------------------
+# Instrument configuration
+# ---------------------------------------------------------------------------
+
+
+# AUTOLENS_PROFILING_SMOKE=1 short-circuit (Phase 5 / CI lint smoke).
+# Verifies the import graph + module-level setup succeeded without running
+# the full profiling pipeline. Skipped entirely when the env var is unset.
+import os as _smoke_os
+import sys as _smoke_sys
+if _smoke_os.environ.get("AUTOLENS_PROFILING_SMOKE") == "1":
+    print(f"[smoke] {__file__}: imports + module setup OK; exiting.")
+    _smoke_sys.exit(0)
+
+# Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
+# Tolerates extra/unknown args via parse_known_args inside the helper.
+from _profile_cli import (  # noqa: E402
+    parse_profile_cli,
+    device_info_dict,
+    resolve_output_paths,
+    auto_simulate_if_missing,
+)
+from simulators.imaging import INSTRUMENTS  # noqa: E402
+_cli = parse_profile_cli()
+
+instrument = "hst"  # <-- change this to profile a different instrument
+
+
+# ---------------------------------------------------------------------------
+# Profiling helpers
+# ---------------------------------------------------------------------------
+
+class Timer:
+    """Collect labelled wall-clock timings in ``records`` and print them on demand."""
+
+    def __init__(self):
+        self.records: list[tuple[str, float]] = []  # (label, seconds), in call order
+
+    @contextmanager
+    def section(self, label: str):
+        """Time the ``with`` body via ``time.perf_counter``; record and print it."""
+        start = time.perf_counter()
+        yield
+        elapsed = time.perf_counter() - start  # not reached if the body raised
+        self.records.append((label, elapsed))
+        print(f"  [{label}] {elapsed:.4f} s")
+
+    def summary(self):
+        """Print every recorded section plus the total; safe when nothing was recorded."""
+        print("\n" + "=" * 70)
+        print("PROFILING SUMMARY")
+        print("=" * 70)
+        max_label = max((len(label) for label, _ in self.records), default=0)
+        for label, elapsed in self.records:
+            print(f"  {label:<{max_label}}  {elapsed:>10.4f} s")
+        total = sum(elapsed for _, elapsed in self.records)
+        print("-" * 70)
+        print(f"  {'TOTAL':<{max_label}}  {total:>10.4f} s")
+        print("=" * 70)
+
+
+def block(x):
+    """Call ``x.block_until_ready()`` when *x* provides it (JAX arrays), then return *x*."""
+    if hasattr(x, "block_until_ready"):
+        x.block_until_ready()
+    return x
+
+
+def jit_profile(func, label, *args, n_repeats=10):
+    """Time lowering, compilation, first call and repeated calls of ``jax.jit(func)``.
+
+    Each phase is recorded on the module-level ``timer``; returns ``(compiled, result)``.
+    """
+    lower_tag, compile_tag = f"{label}_lower", f"{label}_compile"
+    first_tag, steady_tag = f"{label}_first_call", f"{label}_steady_x{n_repeats}"
+    jitted = jax.jit(func)
+    with timer.section(lower_tag):
+        lowered = jitted.lower(*args)
+
+    with timer.section(compile_tag):
+        compiled = lowered.compile()
+
+    with timer.section(first_tag):
+        result = block(compiled(*args))
+
+    with timer.section(steady_tag):
+        for _ in range(n_repeats):
+            result = block(compiled(*args))
+
+    # The steady-state section is the newest record; average it over the repeats.
+    steady_seconds = timer.records[-1][1]
+    print(f"    -> per-call avg: {steady_seconds / n_repeats:.6f} s")
+    return compiled, result
+
+
+timer = Timer()
+
+# ===================================================================
+# PART A — Setup (not JIT-compiled)
+# ===================================================================
+
+# ---------------------------------------------------------------------------
+# 1. Dataset
+# ---------------------------------------------------------------------------
+
+print(f"\n--- Dataset loading & masking [{instrument}] ---")
+
+_script_dir = Path(__file__).resolve().parent
+_workspace_root = _script_dir.parents[1]
+pixel_scale = INSTRUMENTS[instrument]["pixel_scale"]
+dataset_path = Path("dataset") / "imaging" / instrument
+
+auto_simulate_if_missing(
+    dataset_path,
+    dataset_type="imaging",
+    instrument=instrument,
+    workspace_root=_workspace_root,
+)
+
+with timer.section("dataset_load"):
+    dataset = al.Imaging.from_fits(
+        data_path=dataset_path / "data.fits",
+        psf_path=dataset_path / "psf.fits",
+        noise_map_path=dataset_path / "noise_map.fits",
+        pixel_scales=pixel_scale,
+    )
+
+with timer.section("mask_and_oversample"):
+    mask_radius = 3.5
+
+    mask = al.Mask2D.circular(
+        shape_native=dataset.shape_native,
+        pixel_scales=dataset.pixel_scales,
+        radius=mask_radius,
+    )
+
+    dataset = dataset.apply_mask(mask=mask)
+    dataset = dataset.apply_over_sampling(
+        over_sample_size_lp=4,
+        over_sample_size_pixelization=1,
+    )
+
+    over_sample_size = al.util.over_sample.over_sample_size_via_radial_bins_from(
+        grid=dataset.grid,
+        sub_size_list=[4, 2, 1],
+        radial_list=[0.3, 0.6],
+        centre_list=[(0.0, 0.0)],
+    )
+
+    dataset = dataset.apply_over_sampling(
+        over_sample_size_lp=over_sample_size,
+        over_sample_size_pixelization=1,
+    )
+
+# ---------------------------------------------------------------------------
+# 2. Adapt image + image mesh (Hilbert)
+# ---------------------------------------------------------------------------
+#
+# ``image_mesh.Hilbert`` places the source mesh vertices in the image plane by
+# inverse-transform-sampling a Hilbert-curve ordering of the lensed source
+# adapt image. The result is a sparser mesh in faint regions and a denser one
+# where the source actually lives — production-grade, replaces the
+# uniform-coverage ``image_mesh.Overlay`` + circular-edge fallback that
+# preceded the Hilbert path. ``zeroed_pixels=0`` because Hilbert's placement
+# is data-driven; there are no fixed-position edge points to mask out.
+
+print("\n--- Adapt image (lensed source) ---")
+
+with timer.section("adapt_image_build"):
+    adapt_image = adapt_image_for_dataset(
+        dataset_path=dataset_path, dataset=dataset
+    )
+
+print(f"  adapt_image shape (slim): {adapt_image.shape_slim}")
+
+print("\n--- Image mesh construction (Hilbert) ---")
+
+n_mesh_vertices = 1500  # 1500-tier production fiducial
+
+with timer.section("image_mesh_hilbert"):
+    image_mesh = al.image_mesh.Hilbert(
+        pixels=n_mesh_vertices, weight_power=1.0, weight_floor=0.0
+    )
+    image_plane_mesh_grid = image_mesh.image_plane_mesh_grid_from(
+        mask=dataset.mask, adapt_data=adapt_image
+    )
+
+edge_pixels_total = 0
+print(f"  Hilbert pixels: {n_mesh_vertices}")
+print(f"  Mesh vertices placed: {image_plane_mesh_grid.shape[0]}")
+
+# ---------------------------------------------------------------------------
+# 3. Model construction
+# ---------------------------------------------------------------------------
+
+print("\n--- Model construction ---")
+
+with timer.section("model_build"):
+    # GaussianPrior(mean=truth, sigma=small) centres prior-median at the
+    # simulator truth while keeping params free so gradient diagnostics
+    # have dimensionality.
+    # Lens light: MGE-60 (full production-fiducial) — replaces single Sersic.
+    # The 60 linear Gaussians enter the inversion's mapping matrix
+    # alongside the source-pixel columns.
+    lens_bulge = al.model_util.mge_model_from(
+        mask_radius=mask_radius,
+        total_gaussians=60,
+        centre_prior_is_uniform=True,
+    )
+
+    mass = af.Model(al.mp.Isothermal)
+    mass.centre.centre_0 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.centre.centre_1 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.einstein_radius = af.GaussianPrior(mean=1.6, sigma=0.05)
+    _lens_mass_ell = al.convert.ell_comps_from(axis_ratio=0.9, angle=45.0)
+    mass.ell_comps.ell_comps_0 = af.GaussianPrior(mean=_lens_mass_ell[0], sigma=0.01)
+    mass.ell_comps.ell_comps_1 = af.GaussianPrior(mean=_lens_mass_ell[1], sigma=0.01)
+
+    shear = af.Model(al.mp.ExternalShear)
+    shear.gamma_1 = af.GaussianPrior(mean=0.05, sigma=0.005)
+    shear.gamma_2 = af.GaussianPrior(mean=0.05, sigma=0.005)
+
+    lens = af.Model(
+        al.Galaxy, redshift=0.5, bulge=lens_bulge, mass=mass, shear=shear
+    )
+
+    mesh = al.mesh.Delaunay(
+        pixels=n_mesh_vertices,
+        zeroed_pixels=0,
+    )
+    regularization = al.reg.ConstantSplit(coefficient=1.0)
+    pixelization = al.Pixelization(mesh=mesh, regularization=regularization)
+
+    source = af.Model(al.Galaxy, redshift=1.0, pixelization=pixelization)
+
+    model = af.Collection(galaxies=af.Collection(lens=lens, source=source))
+
+print(f"  Total free parameters: {model.total_free_parameters}")
+print(f"  Delaunay pixels: {n_mesh_vertices}")
+print(f"  Zeroed edge pixels: {edge_pixels_total}")
+
+# ---------------------------------------------------------------------------
+# 4. Instantiate concrete objects from prior medians
+# ---------------------------------------------------------------------------
+
+print("\n--- Instantiate concrete model ---")
+
+with timer.section("instance_from_vector"):
+    param_vector = model.physical_values_from_prior_medians
+    instance = model.instance_from_vector(vector=param_vector)
+
+with timer.section("register_pytrees"):
+    _register_model_pytrees(model)
+
+params_tree = jax.tree_util.tree_map(jnp.asarray, instance)
+
+n_pytree_leaves = len(jax.tree_util.tree_leaves(params_tree))
+print(f"  Pytree JAX leaves: {n_pytree_leaves}")
+
+tracer = al.Tracer(galaxies=list(instance.galaxies))
+
+# AdaptImages tells FitImaging where mesh vertices live in image-plane
+adapt_images = al.AdaptImages(
+    galaxy_image_plane_mesh_grid_dict={
+        instance.galaxies.source: image_plane_mesh_grid,
+    },
+    galaxy_name_image_plane_mesh_grid_dict={
+        "('galaxies', 'source')": image_plane_mesh_grid,
+    },
+)
+
+print(f"  Tracer planes: {tracer.total_planes}")
+
+# ---------------------------------------------------------------------------
+# Key configuration that dictates run time
+# ---------------------------------------------------------------------------
+
+n_image_pixels = dataset.data.shape[0]
+n_over_sampled_pixels = dataset.grids.lp.over_sampled.shape[0]
+n_source_pixels = n_mesh_vertices
+
+print("\n--- Configuration (determines run time) ---")
+print(f"  Instrument:              {instrument}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Mask radius:             {mask_radius} arcsec")
+print(f"  Image pixels (masked):   {n_image_pixels}")
+print(f"  Over-sampled pixels:     {n_over_sampled_pixels}")
+print(f"  Delaunay vertices:       {n_source_pixels}")
+print(f"  Edge zeroed pixels:      {edge_pixels_total}")
+
+# ---------------------------------------------------------------------------
+# 5. Full-pipeline reference (FitImaging) — eager baseline
+# ---------------------------------------------------------------------------
+
+print("\n--- Full FitImaging (eager baseline) ---")
+
+with timer.section("fit_imaging_eager"):
+    fit = al.FitImaging(
+        dataset=dataset,
+        tracer=tracer,
+        adapt_images=adapt_images,
+        settings=al.Settings(
+            use_border_relocator=True,
+            use_mixed_precision=_cli.use_mixed_precision,
+        ),
+        xp=np,
+    )
+    log_evidence_ref = fit.figure_of_merit
+    log_likelihood_ref = fit.log_likelihood
+
+print(f"  figure_of_merit (log_evidence) = {log_evidence_ref}")
+print(f"  log_likelihood                 = {log_likelihood_ref}")
+
+
+# ===================================================================
+# PART C — Full-pipeline JIT for comparison
+# ===================================================================
+
+print("\n" + "=" * 70)
+print("FULL-PIPELINE JIT (for comparison)")
+print("=" * 70)
+
+analysis = al.AnalysisImaging(dataset=dataset, adapt_images=adapt_images, use_jax=True)
+n_full_repeats = 10  # steady-state repeat count; reused below as the per-call divisor
+
+def full_pipeline_from_params(params_tree):  # JIT entry point for the whole likelihood
+    return analysis.log_likelihood_function(instance=params_tree)
+
+_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree, n_repeats=n_full_repeats)
+full_pipeline_per_call = timer.records[-1][1] / n_full_repeats  # last record = steady-state section
+print(f"  full log_likelihood = {full_result}")
+
+# ===================================================================
+# PART D — vmap + correctness
+# ===================================================================
+
+print("\n--- vmap batched evaluation ---")
+
+# WARNING: The vmap compilation for the Delaunay pipeline takes 20+ minutes on CPU.
+# The XLA graph for a batched Delaunay inversion (including scipy triangulation,
+# border relocation, interpolation, mapping matrix construction, and PSF convolution)
+# is extremely large. The single-call JIT above compiles in ~2s and runs in ~1.8s,
+# but vmap recompiles the entire graph for batch_size independent evaluations.
+#
+# This is likely a candidate for optimisation — either via custom_vjp to avoid
+# retracing the full pipeline, or by restructuring the Delaunay steps to reduce
+# the XLA graph size. For now, skip vmap by default and run it only when explicitly
+# requested via DELAUNAY_VMAP=1 environment variable.
+
+import os
+run_vmap = os.environ.get("DELAUNAY_VMAP", "0") == "1"
+
+if not run_vmap:
+    print("  SKIPPED: vmap compilation takes 20+ minutes for Delaunay pipeline.")
+    print("  Set DELAUNAY_VMAP=1 to run this section.")
+    vmap_batch_time = None
+    vmap_per_call = None
+    vmap_speedup = None
+else:
+
+    batch_size = 3
+    parameters = jax.tree_util.tree_map(
+        lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
+        params_tree,
+    )
+
+    vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
+
+    with timer.section("vmap_first_call"):
+        result_vmap = vmapped_full(parameters)
+        block(result_vmap)
+
+    n_vmap_repeats = 10
+    with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
+        for _ in range(n_vmap_repeats):
+            result_vmap = vmapped_full(parameters)
+            block(result_vmap)
+
+    vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
+    vmap_per_call = vmap_batch_time / batch_size
+    vmap_speedup = full_pipeline_per_call / vmap_per_call
+
+    print(f"  batch results = {result_vmap}")
+    print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
+    print(f"  vmap per call:         {vmap_per_call:.6f} s")
+    print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
+    print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
+
+    np.testing.assert_allclose(
+        np.array(result_vmap),
+        float(full_result),
+        rtol=1e-4,
+        err_msg="delaunay: JAX vmap likelihood mismatch",
+    )
+    print("  Correctness check PASSED")
+
+    # --- Static memory analysis ---
+
+    print("\n--- Static memory analysis ---")
+
+    lowered_batched = vmapped_full.lower(parameters)
+    compiled_batched = lowered_batched.compile()
+
+    memory_analysis = compiled_batched.memory_analysis()
+    print(f"  Output size:  {memory_analysis.output_size_in_bytes / 1024**2:.3f} MB")
+    print(f"  Temp size:    {memory_analysis.temp_size_in_bytes / 1024**2:.3f} MB")
+    print(
+        f"  Total:        "
+        f"{(memory_analysis.output_size_in_bytes + memory_analysis.temp_size_in_bytes) / 1024**2:.3f} MB"
+    )
+
+
+# ===================================================================
+# JAX Likelihood Function Summary
+# ===================================================================
+
+import json
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+al_version = al.__version__
+
+print("\n" + "=" * 70)
+print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print("=" * 70)
+print(f"  Instrument:            {instrument}")
+print(f"  Pixel scale:           {pixel_scale} arcsec/pixel")
+print(f"  Mask radius:           {mask_radius} arcsec")
+print(f"  Image pixels (masked): {n_image_pixels}")
+print(f"  Over-sampled pixels:   {n_over_sampled_pixels}")
+print(f"  Delaunay vertices:     {n_source_pixels}")
+print(f"  Edge zeroed pixels:    {edge_pixels_total}")
+# Per-step rows are printed by the likelihood_breakdown variant of this script;
+# the runtime variant reports only the full-pipeline (and optional vmap) rows.
+print("-" * 70)
+print(f"      {'Full pipeline (single JIT)':<30}  {full_pipeline_per_call:>12.6f} s")
+if vmap_per_call is not None:
+    print(f"      {'vmap batch (per call)':<30}  {vmap_per_call:>12.6f} s")
+    print(f"      {'vmap speedup vs single JIT':<30}  {vmap_speedup:>11.1f}x")
+else:
+    print(f"      {'vmap':<30}  {'SKIPPED':>12}")
+print("=" * 70)
+
+# --- Save results dictionary ---
+
+likelihood_summary = {
+    "autolens_version": al_version,
+    "device": device_info_dict(),
+    "instrument": instrument,
+    "configuration": {
+        "pixel_scale_arcsec": pixel_scale,
+        "mask_radius_arcsec": mask_radius,
+        "image_pixels_masked": int(n_image_pixels),
+        "over_sampled_pixels": int(n_over_sampled_pixels),
+        "delaunay_vertices": int(n_source_pixels),
+        "edge_zeroed_pixels": int(edge_pixels_total),
+    },
+    "full_pipeline_single_jit": full_pipeline_per_call,
+    "vmap": "SKIPPED — compilation takes 20+ minutes (set DELAUNAY_VMAP=1)",
+}
+
+if vmap_per_call is not None:
+    likelihood_summary["vmap"] = {
+        "batch_size": batch_size,
+        "batch_time": vmap_batch_time,
+        "per_call": vmap_per_call,
+        "speedup_vs_single_jit": round(vmap_speedup, 1),
+    }
+
+dict_path, chart_path = resolve_output_paths(
+    _cli,
+    default_dir=_workspace_root / "results" / "likelihood" / "imaging",
+    default_basename=f"delaunay_likelihood_summary_{instrument}_v{al_version}",
+)
+dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+print(f"\n  Results dict saved to: {dict_path}")
+print(f"  Bar chart path:        {chart_path} (no per-step chart in runtime variant)")
+
+
+# ===================================================================
+# Regression assertion — realistic-scale deterministic log-evidence
+# ===================================================================
+#
+# Simulator truth parameters via GaussianPrior(mean=truth, sigma=small)
+# make the full-pipeline log-evidence deterministic at the prior median.
+# Hilbert image_mesh + 1500-pixel Delaunay; rtol=1e-3 for the JIT paths
+# matches imaging/pixelization (adaptive meshes amplify fp drift through
+# Cholesky / log_det). vmap result asserted only when DELAUNAY_VMAP=1
+# (vmap compile takes 20+ min).
+EXPECTED_LOG_EVIDENCE_HST = 29110.92085793  # 1500-pixel Hilbert/Delaunay, MGE-60 lens, adapt_image=lensed_source
+
+np.testing.assert_allclose(
+    log_evidence_ref,
+    EXPECTED_LOG_EVIDENCE_HST,
+    rtol=1e-4,
+    err_msg=(
+        f"imaging/delaunay[{instrument}]: regression — eager log_evidence drifted "
+        f"(got {log_evidence_ref}, expected {EXPECTED_LOG_EVIDENCE_HST})"
+    ),
+)
+print(
+    f"  Eager regression assertion PASSED: log_evidence matches "
+    f"{EXPECTED_LOG_EVIDENCE_HST:.6f}"
+)
+np.testing.assert_allclose(
+    float(full_result),
+    EXPECTED_LOG_EVIDENCE_HST,
+    rtol=1e-3,
+    err_msg=f"imaging/delaunay[{instrument}]: regression — full log_evidence drifted",
+)
+if run_vmap:
+    np.testing.assert_allclose(
+        np.array(result_vmap),
+        EXPECTED_LOG_EVIDENCE_HST,
+        rtol=1e-3,
+        err_msg=f"imaging/delaunay[{instrument}]: regression — vmap log_evidence drifted",
+    )
+print(f"  Regression assertion PASSED: log_evidence matches {EXPECTED_LOG_EVIDENCE_HST:.6f}")
diff --git a/likelihood_runtime/imaging/mge.py b/likelihood_runtime/imaging/mge.py
new file mode 100644
index 0000000..d4ad6a1
--- /dev/null
+++ b/likelihood_runtime/imaging/mge.py
@@ -0,0 +1,507 @@
+"""
+JAX Profiling: MGE Imaging Likelihood (Full-Pipeline Runtime)
+=============================================================
+
+Profiles the full-pipeline JIT run time of the JAX likelihood function for an imaging
+dataset where the lens galaxy's light is modelled with a multi-Gaussian expansion (MGE).
+
+This runtime variant times the whole likelihood as a single JIT-compiled block;
+the per-step decomposition (which exposes internal bottlenecks) lives in
+``likelihood_breakdown/imaging/mge.py``. The likelihood pipeline steps are:
+
+1. Instance from parameter vector
+2. Build Tracer
+3. Ray-trace grids through the lens
+4. Compute mapping matrix (per-profile images before PSF)
+5. Compute blurred mapping matrix (PSF convolution)
+6. Compute data vector  (D)
+7. Compute curvature matrix  (F)
+8. Reconstruction via positive-only NNLS
+9. Map reconstruction back to image plane
+10. Chi-squared and log likelihood
+
+Note: because the MGE model uses only linear light profiles (lp_linear),
+there is no non-linear blurred image or profile-subtracted image step.
+
+Caveat: XLA may fuse operations differently when compiled as one program vs
+separate pieces, so the full-pipeline time reported here need not equal the
+sum of the per-step timings from ``likelihood_breakdown/imaging/mge.py``.
+
+All JAX timings use `block_until_ready()` to force synchronous measurement.
+
+Pytree-native parameter inputs (recommended pattern)
+----------------------------------------------------
+
+This script uses ``af.ModelInstance`` as the JIT input via PyAutoFit's
+opt-in pytree registration (``autofit.jax.register_model(model)``). The
+JIT'd closures consume the instance directly, so:
+
+* ``model.instance_from_vector`` is no longer called inside the JIT trace —
+  parameter unpacking happens once at registration time and JAX walks the
+  pytree on every call.
+* Parameter identity is preserved through ``jax.jit`` and ``jax.vmap``;
+  XLA cache keys reflect the structured pytree, not a flat vector shape.
+* ``vmap`` batching is ``jax.tree_util.tree_map`` over the instance leaves
+  — callers no longer have to stack a ``(batch, N)`` array.
+
+New profiling scripts should follow this pattern. The flat-vector path in
+``Fitness.call`` / ``model.instance_from_vector(..., xp=jnp)`` remains the
+production likelihood entry point and is intentionally untouched here.
+"""
+
+import numpy as np
+import jax
+import jax.numpy as jnp
+import time
+import subprocess
+import sys
+from pathlib import Path
+from contextlib import contextmanager
+
+import autofit as af
+import autolens as al
+import autoarray as aa
+from autofit.jax import register_model as _register_model_pytrees
+
+# ---------------------------------------------------------------------------
+# Instrument configuration
+# ---------------------------------------------------------------------------
+
+
+# AUTOLENS_PROFILING_SMOKE=1 short-circuit (Phase 5 / CI lint smoke).
+# Verifies the import graph + module-level setup succeeded without running
+# the full profiling pipeline. Skipped entirely when the env var is unset.
+import os as _smoke_os
+import sys as _smoke_sys
+if _smoke_os.environ.get("AUTOLENS_PROFILING_SMOKE") == "1":
+    print(f"[smoke] {__file__}: imports + module setup OK; exiting.")
+    _smoke_sys.exit(0)
+
+# Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
+# Tolerates extra/unknown args via parse_known_args inside the helper.
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _profile_cli import (  # noqa: E402
+    parse_profile_cli,
+    device_info_dict,
+    resolve_output_paths,
+    auto_simulate_if_missing,
+)
+from simulators.imaging import INSTRUMENTS  # noqa: E402
+_cli = parse_profile_cli()
+
+instrument = "hst"  # <-- change this to profile a different instrument
+
+
+# ---------------------------------------------------------------------------
+# Profiling helpers
+# ---------------------------------------------------------------------------
+
+class Timer:
+    """Collect labelled wall-clock timings in ``records`` and print them on demand."""
+
+    def __init__(self):
+        self.records: list[tuple[str, float]] = []  # (label, seconds), in call order
+
+    @contextmanager
+    def section(self, label: str):
+        """Time the ``with`` body via ``time.perf_counter``; record and print it."""
+        start = time.perf_counter()
+        yield
+        elapsed = time.perf_counter() - start  # not reached if the body raised
+        self.records.append((label, elapsed))
+        print(f"  [{label}] {elapsed:.4f} s")
+
+    def summary(self):
+        """Print every recorded section plus the total; safe when nothing was recorded."""
+        print("\n" + "=" * 70)
+        print("PROFILING SUMMARY")
+        print("=" * 70)
+        max_label = max((len(label) for label, _ in self.records), default=0)
+        for label, elapsed in self.records:
+            print(f"  {label:<{max_label}}  {elapsed:>10.4f} s")
+        total = sum(elapsed for _, elapsed in self.records)
+        print("-" * 70)
+        print(f"  {'TOTAL':<{max_label}}  {total:>10.4f} s")
+        print("=" * 70)
+
+
+def block(x):
+    """Call ``x.block_until_ready()`` when *x* provides it (JAX arrays), then return *x*."""
+    if hasattr(x, "block_until_ready"):
+        x.block_until_ready()
+    return x
+
+
+def jit_profile(func, label, *args, n_repeats=10):
+    """Time lowering, compilation, first call and repeated calls of ``jax.jit(func)``.
+
+    Each phase is recorded on the module-level ``timer``; returns ``(compiled, result)``.
+    """
+    lower_tag, compile_tag = f"{label}_lower", f"{label}_compile"
+    first_tag, steady_tag = f"{label}_first_call", f"{label}_steady_x{n_repeats}"
+    jitted = jax.jit(func)
+    with timer.section(lower_tag):
+        lowered = jitted.lower(*args)
+
+    with timer.section(compile_tag):
+        compiled = lowered.compile()
+
+    with timer.section(first_tag):
+        result = block(compiled(*args))
+
+    with timer.section(steady_tag):
+        for _ in range(n_repeats):
+            result = block(compiled(*args))
+
+    # The steady-state section is the newest record; average it over the repeats.
+    steady_seconds = timer.records[-1][1]
+    print(f"    -> per-call avg: {steady_seconds / n_repeats:.6f} s")
+    return compiled, result
+
+
+timer = Timer()
+
+# ===================================================================
+# PART A — Setup (not JIT-compiled)
+# ===================================================================
+
+# ---------------------------------------------------------------------------
+# 1. Dataset
+# ---------------------------------------------------------------------------
+
+print(f"\n--- Dataset loading & masking [{instrument}] ---")
+
+_script_dir = Path(__file__).resolve().parent
+_workspace_root = _script_dir.parents[1]
+pixel_scale = INSTRUMENTS[instrument]["pixel_scale"]
+dataset_path = Path("dataset") / "imaging" / instrument
+
+auto_simulate_if_missing(
+    dataset_path,
+    dataset_type="imaging",
+    instrument=instrument,
+    workspace_root=_workspace_root,
+)
+
+with timer.section("dataset_load"):
+    dataset = al.Imaging.from_fits(
+        data_path=dataset_path / "data.fits",
+        psf_path=dataset_path / "psf.fits",
+        noise_map_path=dataset_path / "noise_map.fits",
+        pixel_scales=pixel_scale,
+    )
+
+with timer.section("mask_and_oversample"):
+    mask_radius = 3.5
+
+    mask = al.Mask2D.circular(
+        shape_native=dataset.shape_native,
+        pixel_scales=dataset.pixel_scales,
+        radius=mask_radius,
+    )
+
+    dataset = dataset.apply_mask(mask=mask)
+    dataset = dataset.apply_over_sampling(over_sample_size_lp=4)
+
+    over_sample_size = al.util.over_sample.over_sample_size_via_radial_bins_from(
+        grid=dataset.grid,
+        sub_size_list=[4, 2, 1],
+        radial_list=[0.3, 0.6],
+        centre_list=[(0.0, 0.0)],
+    )
+
+    dataset = dataset.apply_over_sampling(over_sample_size_lp=over_sample_size)
+
+# ---------------------------------------------------------------------------
+# 2. Model construction
+# ---------------------------------------------------------------------------
+
+print("\n--- Model construction ---")
+
+with timer.section("model_build"):
+    # GaussianPrior(mean=truth, sigma=small) centres prior-median at the
+    # simulator truth while keeping params free so gradient diagnostics
+    # have dimensionality.
+    lens_bulge = al.model_util.mge_model_from(
+        mask_radius=mask_radius, total_gaussians=20, centre_prior_is_uniform=True
+    )
+
+    # Mass and shear fixed to simulator truth (not GaussianPrior) because
+    # tracing GaussianPrior-backed mass params through this script's
+    # ``mapping_matrix_from_params`` JIT trigger a pre-existing xp=np/jnp
+    # propagation bug in autogalaxy/profiles/mass/total/isothermal.py:108
+    # (Isothermal.deflections_yx_2d_from called with xp=np on traced inputs).
+    # The bug is specific to this script's MGE-lens-light + over-sampled-LP
+    # combination; the likelihood-only imaging/mge_gradients.py uses the
+    # same pattern without the failing JIT and works under Option A.
+    mass = af.Model(al.mp.Isothermal)
+    mass.centre = (0.0, 0.0)
+    mass.einstein_radius = 1.6
+    mass.ell_comps = al.convert.ell_comps_from(axis_ratio=0.9, angle=45.0)
+
+    shear = af.Model(al.mp.ExternalShear)
+    shear.gamma_1 = 0.05
+    shear.gamma_2 = 0.05
+
+    lens = af.Model(
+        al.Galaxy, redshift=0.5, bulge=lens_bulge, mass=mass, shear=shear
+    )
+
+    source_bulge = al.model_util.mge_model_from(
+        mask_radius=mask_radius, total_gaussians=20, centre_prior_is_uniform=False
+    )
+
+    source = af.Model(al.Galaxy, redshift=1.0, bulge=source_bulge)
+
+    model = af.Collection(galaxies=af.Collection(lens=lens, source=source))
+
+print(f"  Total free parameters: {model.total_free_parameters}")
+
+# ---------------------------------------------------------------------------
+# 3. Instantiate concrete objects from prior medians
+# ---------------------------------------------------------------------------
+
+print("\n--- Instantiate concrete model ---")
+
+with timer.section("instance_from_vector"):
+    param_vector = model.physical_values_from_prior_medians
+    instance = model.instance_from_vector(vector=param_vector)
+
+# Register every concrete `model.cls` (Galaxy, profile classes, ModelInstance,
+# Collection, …) with `jax.tree_util` so the instance can cross JIT/vmap
+# boundaries directly. This must happen AFTER the model is built, because
+# registration walks the model's class graph.
+with timer.section("register_pytrees"):
+    _register_model_pytrees(model)
+
+# JIT input: the instance itself, with all parameter leaves promoted to JAX
+# arrays. We keep `instance` (the eager NumPy version) around for any
+# non-JIT setup that needs to read parameter values directly.
+params_tree = jax.tree_util.tree_map(jnp.asarray, instance)
+
+tracer = al.Tracer(galaxies=list(instance.galaxies))
+
+print(f"  Tracer planes: {tracer.total_planes}")
+
+# ---------------------------------------------------------------------------
+# Key configuration that dictates run time
+# ---------------------------------------------------------------------------
+
+n_image_pixels = dataset.data.shape[0]
+n_over_sampled_pixels = dataset.grids.lp.over_sampled.shape[0]
+n_linear_gaussians = len(tracer.cls_list_from(cls=al.lp_linear.LightProfileLinear))
+
+print("\n--- Configuration (determines run time) ---")
+print(f"  Instrument:              {instrument}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Mask radius:             {mask_radius} arcsec")
+print(f"  Image pixels (masked):   {n_image_pixels}")
+print(f"  Over-sampled pixels:     {n_over_sampled_pixels}")
+print(f"  Linear Gaussians:        {n_linear_gaussians}")
+
+# ---------------------------------------------------------------------------
+# 4. Full-pipeline reference (FitImaging) — eager baseline
+# ---------------------------------------------------------------------------
+
+print("\n--- Full FitImaging (eager baseline) ---")
+
+with timer.section("fit_imaging_eager"):
+    fit = al.FitImaging(
+        dataset=dataset,
+        tracer=tracer,
+        settings=al.Settings(use_border_relocator=True),
+        xp=np,
+    )
+    log_evidence_ref = fit.figure_of_merit
+    log_likelihood_ref = fit.log_likelihood
+
+print(f"  figure_of_merit (log_evidence) = {log_evidence_ref}")
+print(f"  log_likelihood                 = {log_likelihood_ref}")
+
+# ===================================================================
+# PART C — Full-pipeline JIT for comparison
+# ===================================================================
+
+print("\n" + "=" * 70)
+print("FULL-PIPELINE JIT (for comparison)")
+print("=" * 70)
+
+# Build the analysis with ``use_jax=True`` so its ``log_likelihood_function``
+# threads ``xp=jnp`` through every internal call (border relocation, profile
+# evaluation, inversion, etc.). This is the same wiring that ``Fitness.call``
+# uses in production — we just feed it our pytree-native instance directly
+# instead of going through ``model.instance_from_vector(parameters, xp=jnp)``.
+analysis = al.AnalysisImaging(dataset=dataset, use_jax=True)
+
+def full_pipeline_from_params(params_tree):
+    """Evaluate the full log likelihood for a pytree-shaped ``ModelInstance``.
+
+    The instance crosses the JIT boundary directly (no flat-vector unpacking);
+    constants stay static via ``autofit.jax.register_model``'s ``aux_data``.
+    """
+    log_likelihood = analysis.log_likelihood_function(instance=params_tree)
+    return log_likelihood
+
+_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
+full_pipeline_per_call = timer.records[-1][1] / 10
+
+print(f"  full log_likelihood = {full_result}")
+
+# ===================================================================
+# PART D — vmap + correctness
+# ===================================================================
+
+print("\n--- vmap batched evaluation ---")
+
+batch_size = 3
+
+# Build the batched pytree: every leaf gets a fresh leading batch axis. No
+# flat-vector reshaping required — JAX walks the pytree via the registration
+# we added in PART A.
+parameters = jax.tree_util.tree_map(
+    lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
+    params_tree,
+)
+
+vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
+
+with timer.section("vmap_first_call"):
+    result_vmap = vmapped_full(parameters)
+    block(result_vmap)
+
+n_vmap_repeats = 10
+with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
+    for _ in range(n_vmap_repeats):
+        result_vmap = vmapped_full(parameters)
+        block(result_vmap)
+
+vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
+vmap_per_call = vmap_batch_time / batch_size
+vmap_speedup = full_pipeline_per_call / vmap_per_call
+
+print(f"  batch results = {result_vmap}")
+print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
+print(f"  vmap per call:         {vmap_per_call:.6f} s")
+print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
+print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
+
+np.testing.assert_allclose(
+    np.array(result_vmap),
+    float(full_result),
+    rtol=1e-4,
+    err_msg="mge: JAX vmap likelihood mismatch",
+)
+print("  Correctness check PASSED")
+
+# ===================================================================
+# PART E — Static memory analysis
+# ===================================================================
+
+print("\n--- Static memory analysis ---")
+
+compiled_batched = vmapped_full.lower(parameters).compile()
+memory_analysis = compiled_batched.memory_analysis()
+
+# Byte counts reported by the compiled batched program, printed in MB.
+_mib = 1024**2
+_out_bytes = memory_analysis.output_size_in_bytes
+_tmp_bytes = memory_analysis.temp_size_in_bytes
+print(f"  Output size:  {_out_bytes / _mib:.3f} MB")
+print(f"  Temp size:    {_tmp_bytes / _mib:.3f} MB")
+print(f"  Total:        {(_out_bytes + _tmp_bytes) / _mib:.3f} MB")
+
+
+# ===================================================================
+# JAX Likelihood Function Summary
+# ===================================================================
+
+import json
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+al_version = al.__version__
+
+print("\n" + "=" * 70)
+print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print("=" * 70)
+print(f"  Instrument:            {instrument}")
+print(f"  Pixel scale:           {pixel_scale} arcsec/pixel")
+print(f"  Mask radius:           {mask_radius} arcsec")
+print(f"  Image pixels (masked): {n_image_pixels}")
+print(f"  Over-sampled pixels:   {n_over_sampled_pixels}")
+print(f"  Linear Gaussians:      {n_linear_gaussians}")
+print("-" * 70)
+print(f"      {'Full pipeline (single JIT)':<30}  {full_pipeline_per_call:>12.6f} s")
+print(f"      {f'vmap batch={batch_size} (per call)':<30}  {vmap_per_call:>12.6f} s")
+print(f"      {f'vmap speedup vs single JIT':<30}  {vmap_speedup:>11.1f}x")
+print("=" * 70)
+
+# --- Save results dictionary ---
+
+likelihood_summary = {
+    "autolens_version": al_version,
+    "device": device_info_dict(),
+    "instrument": instrument,
+    "configuration": {
+        "pixel_scale_arcsec": pixel_scale,
+        "mask_radius_arcsec": mask_radius,
+        "image_pixels_masked": int(n_image_pixels),
+        "over_sampled_pixels": int(n_over_sampled_pixels),
+        "linear_gaussians": int(n_linear_gaussians),
+    },
+    "full_pipeline_single_jit": full_pipeline_per_call,
+    "vmap": {
+        "batch_size": batch_size,
+        "batch_time": vmap_batch_time,
+        "per_call": vmap_per_call,
+        "speedup_vs_single_jit": round(vmap_speedup, 1),
+    },
+}
+
+dict_path, chart_path = resolve_output_paths(
+    _cli,
+    default_dir=_workspace_root / "results" / "likelihood" / "imaging",
+    default_basename=f"mge_likelihood_summary_{instrument}_v{al_version}",
+)
+dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+print(f"\n  Results dict saved to: {dict_path}")
+print(f"  Bar chart path:        {chart_path} (no per-step chart in runtime variant)")
+
+
+# ===================================================================
+# Regression assertion — realistic-scale deterministic likelihood
+# ===================================================================
+#
+# Simulator truth parameters (mass + shear fixed; MGE bulges free around
+# default centre/ell_comps priors) put the evaluation point at the
+# physically-meaningful truth operating point. Eager, JIT, and vmap all
+# agree to ~1e-11 precision.
+EXPECTED_LOG_LIKELIHOOD_HST = 27379.38890685539
+
+np.testing.assert_allclose(
+    log_likelihood_ref,
+    EXPECTED_LOG_LIKELIHOOD_HST,
+    rtol=1e-4,
+    err_msg=(
+        f"imaging/mge[{instrument}]: regression — eager log_likelihood drifted "
+        f"(got {log_likelihood_ref}, expected {EXPECTED_LOG_LIKELIHOOD_HST})"
+    ),
+)
+print(
+    f"  Eager regression assertion PASSED: log_likelihood matches "
+    f"{EXPECTED_LOG_LIKELIHOOD_HST:.6f}"
+)
+np.testing.assert_allclose(
+    float(full_result),
+    EXPECTED_LOG_LIKELIHOOD_HST,
+    rtol=1e-4,
+    err_msg=f"imaging/mge[{instrument}]: regression — full log_likelihood drifted",
+)
+np.testing.assert_allclose(
+    np.array(result_vmap),
+    EXPECTED_LOG_LIKELIHOOD_HST,
+    rtol=1e-4,
+    err_msg=f"imaging/mge[{instrument}]: regression — vmap log_likelihood drifted",
+)
+print(f"  Regression assertion PASSED: log_likelihood matches {EXPECTED_LOG_LIKELIHOOD_HST:.6f}")
diff --git a/likelihood_runtime/imaging/pixelization.py b/likelihood_runtime/imaging/pixelization.py
new file mode 100644
index 0000000..1ae7ae7
--- /dev/null
+++ b/likelihood_runtime/imaging/pixelization.py
@@ -0,0 +1,574 @@
+"""
+JAX Profiling: Pixelization Imaging Likelihood (Full-Pipeline Runtime)
+======================================================================
+
+Profiles the JAX likelihood function for an imaging dataset where the source
+galaxy is reconstructed using a rectangular pixelization with constant
+regularization.
+
+This runtime variant times the whole likelihood as a single JIT-compiled block
+(plus a vmap-batched call); it does not time the steps individually. For
+reference, the likelihood pipeline being timed consists of these steps:
+
+1. Ray-trace grids through the lens
+2. Blurred image of lens light (non-linear profiles)
+3. Profile-subtracted image (lens light subtraction)
+4. Border relocation of traced grid
+5. Overlay grid (source pixel centres)
+6. Interpolation weights and mapper construction
+7. Mapping matrix
+8. Blurred mapping matrix (PSF convolution)
+9. Data vector (D)
+10. Curvature matrix (F)
+11. Regularization matrix (H)
+12. Regularized reconstruction: s = (F + H)^{-1} D
+13. Map reconstruction to image + log evidence
+
+Caveat: XLA may fuse operations differently when compiled as one program vs
+separate pieces, so the single-JIT time measured here need not equal the sum
+of the per-step timings reported by the ``likelihood_breakdown`` variant.
+
+All JAX timings use `block_until_ready()` to force synchronous measurement.
+"""
+
+import numpy as np
+import jax
+import jax.numpy as jnp
+import time
+import subprocess
+import sys
+from pathlib import Path
+from contextlib import contextmanager
+
+import autofit as af
+import autolens as al
+import autoarray as aa
+from autofit.jax import register_model as _register_model_pytrees
+
+# Shared adapt-image loader: load or compute+cache `lensed_source.fits`
+# next to the dataset, then return the masked ``aa.Array2D``.
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
+
+# ---------------------------------------------------------------------------
+# Instrument configuration
+# ---------------------------------------------------------------------------
+
+
+# AUTOLENS_PROFILING_SMOKE=1 short-circuit (Phase 5 / CI lint smoke).
+# Verifies the import graph + module-level setup succeeded without running
+# the full profiling pipeline. Skipped entirely when the env var is unset.
+import os as _smoke_os
+import sys as _smoke_sys
+if _smoke_os.environ.get("AUTOLENS_PROFILING_SMOKE") == "1":
+    print(f"[smoke] {__file__}: imports + module setup OK; exiting.")
+    _smoke_sys.exit(0)
+
+# Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
+# Tolerates extra/unknown args via parse_known_args inside the helper.
+from _profile_cli import (  # noqa: E402
+    parse_profile_cli,
+    device_info_dict,
+    resolve_output_paths,
+    auto_simulate_if_missing,
+)
+from simulators.imaging import INSTRUMENTS  # noqa: E402
+_cli = parse_profile_cli()
+
+instrument = "hst"  # <-- change this to profile a different instrument
+
+
+# ---------------------------------------------------------------------------
+# Profiling helpers
+# ---------------------------------------------------------------------------
+
+class Timer:
+    """Accumulates named wall-clock timings and prints a summary table."""
+
+    def __init__(self):
+        self.records: list[tuple[str, float]] = []  # (label, seconds), in call order
+
+    @contextmanager
+    def section(self, label: str):
+        """Time the enclosed block, append ``(label, seconds)`` and print it."""
+        start = time.perf_counter()
+        yield  # no try/finally: a body that raises leaves no record behind
+        elapsed = time.perf_counter() - start
+        self.records.append((label, elapsed))
+        print(f"  [{label}] {elapsed:.4f} s")
+
+    def summary(self):
+        """Print one row per record plus a TOTAL row; safe when nothing was recorded."""
+        print("\n" + "=" * 70)
+        print("PROFILING SUMMARY\n" + "=" * 70)
+        max_label = max((len(r[0]) for r in self.records), default=len("TOTAL"))
+        total = 0.0
+        for label, elapsed in self.records:
+            print(f"  {label:<{max_label}}  {elapsed:>10.4f} s")
+            total += elapsed
+        print("-" * 70)
+        print(f"  {'TOTAL':<{max_label}}  {total:>10.4f} s")
+        print("=" * 70)
+
+
+def block(x):
+    """Wait on *x* via ``block_until_ready`` when it has one; return *x*."""
+    wait = getattr(x, "block_until_ready", lambda: None)
+    wait()
+    return x
+
+
+def jit_profile(func, label, *args, n_repeats=10):
+    """Profile lowering, compilation, first call and steady state of *func*.
+
+    Every phase is recorded on the module-level ``timer`` as ``{label}_<phase>``.
+    Returns ``(compiled, result)``; ``result`` is the output of the last call.
+    """
+    jitted = jax.jit(func)
+
+    with timer.section(f"{label}_lower"):
+        lowered = jitted.lower(*args)
+
+    with timer.section(f"{label}_compile"):
+        compiled = lowered.compile()
+
+    with timer.section(f"{label}_first_call"):
+        result = block(compiled(*args))
+
+    steady_label = f"{label}_steady_x{n_repeats}"
+    with timer.section(steady_label):
+        for _ in range(n_repeats):
+            result = block(compiled(*args))
+
+    _, steady_seconds = timer.records[-1]  # the steady-state section just closed
+    print(f"    -> per-call avg: {steady_seconds / n_repeats:.6f} s")
+    return compiled, result
+
+
+timer = Timer()
+
+# ===================================================================
+# PART A — Setup (not JIT-compiled)
+# ===================================================================
+
+# ---------------------------------------------------------------------------
+# 1. Dataset
+# ---------------------------------------------------------------------------
+
+print(f"\n--- Dataset loading & masking [{instrument}] ---")
+
+_script_dir = Path(__file__).resolve().parent
+_workspace_root = _script_dir.parents[1]
+pixel_scale = INSTRUMENTS[instrument]["pixel_scale"]
+dataset_path = Path("dataset") / "imaging" / instrument
+
+auto_simulate_if_missing(
+    dataset_path,
+    dataset_type="imaging",
+    instrument=instrument,
+    workspace_root=_workspace_root,
+)
+
+with timer.section("dataset_load"):
+    dataset = al.Imaging.from_fits(
+        data_path=dataset_path / "data.fits",
+        psf_path=dataset_path / "psf.fits",
+        noise_map_path=dataset_path / "noise_map.fits",
+        pixel_scales=pixel_scale,
+    )
+
+with timer.section("mask_and_oversample"):
+    mask_radius = 3.5
+
+    mask = al.Mask2D.circular(
+        shape_native=dataset.shape_native,
+        pixel_scales=dataset.pixel_scales,
+        radius=mask_radius,
+    )
+
+    dataset = dataset.apply_mask(mask=mask)
+    dataset = dataset.apply_over_sampling(
+        over_sample_size_lp=4,
+        over_sample_size_pixelization=1,
+    )
+
+    over_sample_size = al.util.over_sample.over_sample_size_via_radial_bins_from(
+        grid=dataset.grid,
+        sub_size_list=[4, 2, 1],
+        radial_list=[0.3, 0.6],
+        centre_list=[(0.0, 0.0)],
+    )
+
+    dataset = dataset.apply_over_sampling(
+        over_sample_size_lp=over_sample_size,
+        over_sample_size_pixelization=1,
+    )
+
+# ---------------------------------------------------------------------------
+# 2. Model construction
+# ---------------------------------------------------------------------------
+
+print("\n--- Model construction ---")
+
+mesh_pixels_yx = 39  # 39x39 = 1521 source pixels — 1500-tier production fiducial
+mesh_shape = (mesh_pixels_yx, mesh_pixels_yx)
+
+with timer.section("model_build"):
+    # GaussianPrior(mean=truth, sigma=small) centres prior-median at the
+    # simulator truth while keeping params free so gradient diagnostics
+    # have dimensionality.
+    # Lens light: MGE-60 (full production-fiducial) — replaces single Sersic.
+    # The 60 linear Gaussians enter the inversion's mapping matrix
+    # alongside the source-pixel columns.
+    lens_bulge = al.model_util.mge_model_from(
+        mask_radius=mask_radius,
+        total_gaussians=60,
+        centre_prior_is_uniform=True,
+    )
+
+    mass = af.Model(al.mp.Isothermal)
+    mass.centre.centre_0 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.centre.centre_1 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.einstein_radius = af.GaussianPrior(mean=1.6, sigma=0.05)
+    _lens_mass_ell = al.convert.ell_comps_from(axis_ratio=0.9, angle=45.0)
+    mass.ell_comps.ell_comps_0 = af.GaussianPrior(mean=_lens_mass_ell[0], sigma=0.01)
+    mass.ell_comps.ell_comps_1 = af.GaussianPrior(mean=_lens_mass_ell[1], sigma=0.01)
+
+    shear = af.Model(al.mp.ExternalShear)
+    shear.gamma_1 = af.GaussianPrior(mean=0.05, sigma=0.005)
+    shear.gamma_2 = af.GaussianPrior(mean=0.05, sigma=0.005)
+
+    lens = af.Model(
+        al.Galaxy, redshift=0.5, bulge=lens_bulge, mass=mass, shear=shear
+    )
+
+    # ``RectangularAdaptImage`` weights mesh pixels by the lensed-source
+    # adapt image — the production-grade alternative to the coordinate-
+    # density-only ``RectangularAdaptDensity``. Adapt image is loaded /
+    # cached below; the same shape and regularization are kept.
+    pixelization = al.Pixelization(
+        mesh=al.mesh.RectangularAdaptImage(
+            shape=mesh_shape, weight_power=1.0, weight_floor=0.0
+        ),
+        regularization=al.reg.Constant(coefficient=1.0),
+    )
+
+    source = af.Model(al.Galaxy, redshift=1.0, pixelization=pixelization)
+
+    model = af.Collection(galaxies=af.Collection(lens=lens, source=source))
+
+print(f"  Total free parameters: {model.total_free_parameters}")
+print(f"  Mesh shape: {mesh_shape}")
+print(f"  Source pixels: {mesh_pixels_yx * mesh_pixels_yx}")
+
+# ---------------------------------------------------------------------------
+# 3. Instantiate concrete objects from prior medians
+# ---------------------------------------------------------------------------
+
+print("\n--- Instantiate concrete model ---")
+
+with timer.section("instance_from_vector"):
+    param_vector = model.physical_values_from_prior_medians
+    instance = model.instance_from_vector(vector=param_vector)
+
+with timer.section("register_pytrees"):
+    _register_model_pytrees(model)
+
+params_tree = jax.tree_util.tree_map(jnp.asarray, instance)
+tracer = al.Tracer(galaxies=list(instance.galaxies))
+
+print(f"  Tracer planes: {tracer.total_planes}")
+
+# ---------------------------------------------------------------------------
+# Key configuration that dictates run time
+# ---------------------------------------------------------------------------
+
+n_image_pixels = dataset.data.shape[0]
+n_over_sampled_pixels = dataset.grids.lp.over_sampled.shape[0]
+n_source_pixels = mesh_pixels_yx * mesh_pixels_yx
+
+print("\n--- Configuration (determines run time) ---")
+print(f"  Instrument:              {instrument}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Mask radius:             {mask_radius} arcsec")
+print(f"  Image pixels (masked):   {n_image_pixels}")
+print(f"  Over-sampled pixels:     {n_over_sampled_pixels}")
+print(f"  Mesh shape:              {mesh_shape}")
+print(f"  Source pixels:           {n_source_pixels}")
+
+# ---------------------------------------------------------------------------
+# 4. Adapt image — PSF-convolved lensed-source image used by
+#    ``RectangularAdaptImage`` to weight mesh pixels. Loads ``lensed_source.fits``
+#    from the dataset directory if present, otherwise computes it from the
+#    truth tracer and caches the file for sibling scripts on the same
+#    instrument.
+# ---------------------------------------------------------------------------
+
+print("\n--- Adapt image (lensed source) ---")
+
+with timer.section("adapt_image_build"):
+    adapt_image = adapt_image_for_dataset(
+        dataset_path=dataset_path, dataset=dataset
+    )
+    # ``galaxy_image_dict`` (Galaxy-object-keyed) feeds the eager-path
+    # ``image_for_galaxy`` lookup; ``galaxy_name_image_dict`` (path-tuple
+    # str-keyed) is rebuilt inside JIT closures where the Galaxy objects
+    # are reconstructed on every call. Both must be supplied here.
+    adapt_images = al.AdaptImages(
+        galaxy_image_dict={instance.galaxies.source: adapt_image},
+        galaxy_name_image_dict={"('galaxies', 'source')": adapt_image},
+    )
+
+print(f"  adapt_image shape (slim): {adapt_image.shape_slim}")
+
+# ---------------------------------------------------------------------------
+# 5. Full-pipeline reference (FitImaging) — eager baseline
+# ---------------------------------------------------------------------------
+
+print("\n--- Full FitImaging (eager baseline) ---")
+
+with timer.section("fit_imaging_eager"):
+    fit = al.FitImaging(
+        dataset=dataset,
+        tracer=tracer,
+        adapt_images=adapt_images,
+        settings=al.Settings(
+            use_border_relocator=True,
+            use_mixed_precision=_cli.use_mixed_precision,
+        ),
+        xp=np,
+    )
+    log_evidence_ref = fit.figure_of_merit
+    log_likelihood_ref = fit.log_likelihood
+
+print(f"  figure_of_merit (log_evidence) = {log_evidence_ref}")
+print(f"  log_likelihood                 = {log_likelihood_ref}")
+
+
+# ===================================================================
+# PART C — Full-pipeline JIT for comparison
+# ===================================================================
+
+print("\n" + "=" * 70)
+print("FULL-PIPELINE JIT (for comparison)")
+print("=" * 70)
+
+analysis = al.AnalysisImaging(
+    dataset=dataset,
+    adapt_images=adapt_images,
+    settings=al.Settings(
+        use_border_relocator=True,
+        use_mixed_precision=_cli.use_mixed_precision,
+    ),
+    use_jax=True,
+)
+
+def full_pipeline_from_params(params_tree):  # traced entry point for jit / vmap below
+    return analysis.log_likelihood_function(instance=params_tree)
+
+_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
+full_pipeline_per_call = timer.records[-1][1] / 10
+
+print(f"  full log_likelihood = {full_result}")
+
+# ===================================================================
+# PART D — vmap + correctness
+# ===================================================================
+#
+# NOTE: vmap requires at least one JAX array leaf in the params_tree.
+# When model.total_free_parameters == 0 (all params fixed to truth), the
+# pytree has no array leaves and vmap cannot batch over it. Skip in that case.
+
+print("\n--- vmap batched evaluation ---")
+
+batch_size = 3
+vmap_batch_time = None
+vmap_per_call = None
+vmap_speedup = None
+result_vmap = None
+
+_n_leaves = len(jax.tree_util.tree_leaves(params_tree))
+_vmap_skipped_reason = None
+if _n_leaves == 0:
+    _vmap_skipped_reason = (
+        "model has 0 free parameters (all fixed to truth); vmap "
+        "requires at least one array leaf."
+    )
+else:
+    parameters = jax.tree_util.tree_map(
+        lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
+        params_tree,
+    )
+
+    vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
+
+    # 1521-source-pixel adapt-mesh pipelines push the per-batch working
+    # set past 2.5 GB; on smaller GPUs the vmap compile / first call can
+    # OOM. Catch and skip cleanly rather than killing the script.
+    try:
+        with timer.section("vmap_first_call"):
+            result_vmap = vmapped_full(parameters)
+            block(result_vmap)
+    except Exception as exc:
+        if "RESOURCE_EXHAUSTED" in str(exc) or "Out of memory" in str(exc):
+            _vmap_skipped_reason = (
+                f"OOM during vmap first call (batch_size={batch_size}); skip vmap. "
+                f"Re-run on a bigger device or lower `batch_size`."
+            )
+        else:
+            raise
+
+if _vmap_skipped_reason is None and _n_leaves > 0:
+    n_vmap_repeats = 10
+    with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
+        for _ in range(n_vmap_repeats):
+            result_vmap = vmapped_full(parameters)
+            block(result_vmap)
+
+    vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
+    vmap_per_call = vmap_batch_time / batch_size
+    vmap_speedup = full_pipeline_per_call / vmap_per_call
+
+    print(f"  batch results = {result_vmap}")
+    print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
+    print(f"  vmap per call:         {vmap_per_call:.6f} s")
+    print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
+    print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
+
+    np.testing.assert_allclose(
+        np.array(result_vmap),
+        float(full_result),
+        rtol=1e-4,
+        err_msg="pixelization: JAX vmap likelihood mismatch",
+    )
+    print("  Correctness check PASSED")
+else:
+    print(f"  SKIPPED: {_vmap_skipped_reason}")
+
+# ===================================================================
+# PART E — Static memory analysis
+# ===================================================================
+
+print("\n--- Static memory analysis ---")
+
+if _vmap_skipped_reason is None:
+    # vmap ran, so the batched program can be lowered and compiled here to
+    # query the byte counts it reports (printed in MB).
+    compiled_batched = vmapped_full.lower(parameters).compile()
+    memory_analysis = compiled_batched.memory_analysis()
+    _mib = 1024**2
+    _out_bytes = memory_analysis.output_size_in_bytes
+    _tmp_bytes = memory_analysis.temp_size_in_bytes
+    print(f"  Output size:  {_out_bytes / _mib:.3f} MB")
+    print(f"  Temp size:    {_tmp_bytes / _mib:.3f} MB")
+    print(f"  Total:        {(_out_bytes + _tmp_bytes) / _mib:.3f} MB")
+else:
+    print(f"  SKIPPED: {_vmap_skipped_reason}")
+    memory_analysis = None
+
+
+# ===================================================================
+# JAX Likelihood Function Summary
+# ===================================================================
+
+import json
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+al_version = al.__version__
+
+print("\n" + "=" * 70)
+print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print("=" * 70)
+print(f"  Instrument:            {instrument}")
+print(f"  Pixel scale:           {pixel_scale} arcsec/pixel")
+print(f"  Mask radius:           {mask_radius} arcsec")
+print(f"  Image pixels (masked): {n_image_pixels}")
+print(f"  Over-sampled pixels:   {n_over_sampled_pixels}")
+print(f"  Mesh shape:            {mesh_shape}")
+print(f"  Source pixels:         {n_source_pixels}")
+print("-" * 70)
+print(f"      {'Full pipeline (single JIT)':<30}  {full_pipeline_per_call:>12.6f} s")
+if vmap_per_call is not None:
+    print(f"      {'vmap batch (per call)':<30}  {vmap_per_call:>12.6f} s")
+    print(f"      {'vmap speedup vs single JIT':<30}  {vmap_speedup:>11.1f}x")
+else:
+    # vmap can be skipped for two reasons (no array leaves, or OOM on the
+    # first call); report the recorded one instead of assuming the former.
+    print(f"      {'vmap':<30}  SKIPPED: {_vmap_skipped_reason}")
+print("=" * 70)
+
+# --- Save results dictionary ---
+
+likelihood_summary = {
+    "autolens_version": al_version,
+    "device": device_info_dict(),
+    "instrument": instrument,
+    "configuration": {
+        "pixel_scale_arcsec": pixel_scale,
+        "mask_radius_arcsec": mask_radius,
+        "image_pixels_masked": int(n_image_pixels),
+        "over_sampled_pixels": int(n_over_sampled_pixels),
+        "mesh_shape": list(mesh_shape),
+        "source_pixels": int(n_source_pixels),
+    },
+    "full_pipeline_single_jit": full_pipeline_per_call,
+    "vmap": f"SKIPPED — {_vmap_skipped_reason}" if vmap_per_call is None else {
+        "batch_size": batch_size,
+        "batch_time": vmap_batch_time,
+        "per_call": vmap_per_call,
+        "speedup_vs_single_jit": round(vmap_speedup, 1),
+    },
+}
+
+dict_path, chart_path = resolve_output_paths(
+    _cli,
+    default_dir=_workspace_root / "results" / "likelihood" / "imaging",
+    default_basename=f"pixelization_likelihood_summary_{instrument}_v{al_version}",
+)
+dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+print(f"\n  Results dict saved to: {dict_path}")
+print(f"  Bar chart path:        {chart_path} (no per-step chart in runtime variant)")
+
+
+# ===================================================================
+# Regression assertion — realistic-scale deterministic log-evidence
+# ===================================================================
+#
+# RectangularAdaptImage at prior medians anchors the regression on the
+# *eager* FitImaging value (deterministic to fp64 noise). The full-pipeline
+# single-JIT / vmap paths agree with eager to ~1e-3 only: adaptive mesh
+# weighting amplifies fp accumulation in Cholesky / log_det on the bigger
+# 1581x1581 mapping matrix relative to the non-adaptive baseline (which
+# previously matched at 1e-4). The 1e-3 envelope is still tight enough to
+# catch real numerical regressions while accommodating the adaptive path.
+EXPECTED_LOG_EVIDENCE_HST = 28370.27770182  # 39x39 = 1521 source pixels, MGE-60 lens light, adapt_image=lensed_source
+
+np.testing.assert_allclose(
+    log_evidence_ref,
+    EXPECTED_LOG_EVIDENCE_HST,
+    rtol=1e-4,
+    err_msg=(
+        f"imaging/pixelization[{instrument}]: regression — eager log_evidence drifted "
+        f"(got {log_evidence_ref}, expected {EXPECTED_LOG_EVIDENCE_HST})"
+    ),
+)
+print(
+    f"  Eager regression assertion PASSED: log_evidence matches "
+    f"{EXPECTED_LOG_EVIDENCE_HST:.6f}"
+)
+np.testing.assert_allclose(
+    float(full_result),
+    EXPECTED_LOG_EVIDENCE_HST,
+    rtol=1e-3,
+    err_msg=f"imaging/pixelization[{instrument}]: regression — full log_evidence drifted",
+)
+if result_vmap is not None:
+    np.testing.assert_allclose(
+        np.array(result_vmap),
+        EXPECTED_LOG_EVIDENCE_HST,
+        rtol=1e-3,
+        err_msg=f"imaging/pixelization[{instrument}]: regression — vmap log_evidence drifted",
+    )
+print(f"  Regression assertion PASSED: log_evidence matches {EXPECTED_LOG_EVIDENCE_HST:.6f}")
diff --git a/likelihood_runtime/interferometer/delaunay.py b/likelihood_runtime/interferometer/delaunay.py
new file mode 100644
index 0000000..f7b62a2
--- /dev/null
+++ b/likelihood_runtime/interferometer/delaunay.py
@@ -0,0 +1,647 @@
+"""
+JAX Profiling: Delaunay Interferometer Likelihood
+=================================================
+
+Profiles the JAX likelihood function for an interferometer dataset where the
+source galaxy is reconstructed using a Delaunay pixelization with cross-
+derivative (``ConstantSplit``) regularization, and the lens galaxy is an
+Isothermal + ExternalShear.
+
+Mirrors ``likelihood_runtime/interferometer/pixelization.py`` (Phase 2) with
+the ``RectangularUniform`` source replaced by a ``Delaunay`` mesh — matching
+``likelihood_runtime/imaging/delaunay.py`` so imaging vs interferometer
+Delaunay results can be compared side-by-side.
+
+Runtime variant: only the full pipeline is JIT-profiled here. The step-by-step
+pedagogy (shared with the imaging Delaunay script) lives in
+``likelihood_breakdown/interferometer/delaunay.py``; its 11 stages map 1:1 onto sections of
+``autolens_workspace/scripts/interferometer/features/datacube/likelihood_function.py``
+and its single-channel parent
+``interferometer/features/pixelization/likelihood_function.py``.
+
+Pipeline steps (matching the imaging-delaunay numbering for cross-reference;
+the two lens-light steps from the imaging sibling are dropped since the
+interferometer pixelization model has no parametric lens light):
+
+ 1. Ray-trace data grid to source plane.
+ 2. Ray-trace mesh grid (image-plane Hilbert vertices) to source plane.
+ 5. Border relocation (data grid + mesh grid).
+ 6. Delaunay triangulation + interpolation + mapper.
+ 7. Mapping matrix.
+ 8. Transformed mapping matrix (NUFFT) — interferometer-specific. Replaces
+    imaging's PSF-convolved blurred mapping matrix; the difference is the
+    Fourier transform to visibility space rather than image-space convolution.
+ 9. Data vector D — visibility-space (real and imaginary components).
+ 10. Curvature matrix F — real and imaginary curvatures summed.
+ 11. Regularization matrix H — ConstantSplit (same as imaging).
+ 12. Reconstruction s = NNLS(F + H, D) (same NNLS path as imaging).
+ 13. Mapped reconstructed visibilities + log evidence (visibility-space χ²).
+
+Measures:
+
+1. Eager baseline: ``FitInterferometer`` with ``xp=np``, print
+   ``figure_of_merit`` / ``log_likelihood``.
+2. Per-step JIT profiling: NOT run in this runtime variant (the per-stage
+   ``jit_profile()`` calls live in the ``likelihood_breakdown`` sibling).
+3. Full-pipeline JIT: ``jax.jit(analysis.log_likelihood_function)`` on a
+   pytree-registered ``ModelInstance``. Measure lower / compile / first-call /
+   steady-state per-call.
+4. Batched evaluation (opt-in via ``DELAUNAY_VMAP=1``): ``jax.jit(jax.vmap(...))``.
+   Skipped by default because Delaunay vmap compilation can take 20+ minutes
+   on CPU due to triangulation + interpolation graph size.
+5. Correctness: eager vs full-pipeline JIT log-evidence agreement at
+   ``rtol=1e-4`` (plus vmap vs single-JIT agreement when vmap runs).
+6. Static memory analysis of the batched program (only when vmap runs).
+7. Results JSON written under ``results/`` by default; no per-step entries
+   and no bar-chart PNG are produced by this runtime variant.
+
+JIT-blocker notes
+-----------------
+
+Per-step decomposition risks missing cross-step XLA fusion and hitting
+library-level JAX blockers. Caveats from the previous opt-out version that
+still apply:
+
+- ``dataset.transformer.transform_mapping_matrix`` is JIT-friendly for
+  ``TransformerDFT`` (a single matrix multiply) and the default SMA preset
+  uses it. The JAX-native ``al.TransformerNUFFT`` (nufftax-backed) IS
+  JIT-friendly today, but it is currently incompatible with
+  ``apply_sparse_operator`` (see
+  ``PyAutoArray/autoarray/dataset/interferometer/dataset.py:261``) — the
+  Delaunay path here relies on the sparse precision operator, so the
+  transformer stays on DFT. The legacy ``TransformerNUFFTPyNUFFT`` is
+  pynufft-based and is not JIT-friendly.
+- The visibility-space χ² in step 13 separates the complex visibilities and
+  noise into real/imag components inside the JIT body (matching the
+  ``pixelization/likelihood_function.py`` reference). Complex-valued JIT
+  with autoarray ``Visibilities`` wrappers is avoided.
+
+Pytree-native parameter inputs
+------------------------------
+
+Uses ``af.ModelInstance`` as the JIT input via PyAutoFit's opt-in pytree
+registration (``autofit.jax.register_model``). Exercises the ``TuplePrior``
+pytree support landed in PyAutoFit#1222.
+"""
+
+import os
+import numpy as np
+import jax
+import jax.numpy as jnp
+import time
+import subprocess
+import sys
+from pathlib import Path
+from contextlib import contextmanager
+
+import autofit as af
+import autolens as al
+from autofit.jax import register_model as _register_model_pytrees
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
+
+# ---------------------------------------------------------------------------
+# Instrument configuration
+# ---------------------------------------------------------------------------
+
+
+# AUTOLENS_PROFILING_SMOKE=1 short-circuit (Phase 5 / CI lint smoke).
+# Verifies the imports above succeeded without running the profiling pipeline;
+# exits before the _profile_cli / simulators imports. No-op when the var is unset.
+import os as _smoke_os
+import sys as _smoke_sys
+if _smoke_os.environ.get("AUTOLENS_PROFILING_SMOKE") == "1":
+    print(f"[smoke] {__file__}: imports + module setup OK; exiting.")
+    _smoke_sys.exit(0)
+
+# Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
+# Tolerates extra/unknown args via parse_known_args inside the helper.
+from _profile_cli import (  # noqa: E402
+    parse_profile_cli,
+    device_info_dict,
+    resolve_output_paths,
+    auto_simulate_if_missing,
+)
+from simulators.interferometer import INSTRUMENTS  # noqa: E402
+_cli = parse_profile_cli()
+
+instrument = "sma"  # <-- change this to profile a different instrument
+
+hilbert_pixels = 1000  # 1000-tier production fiducial for Hilbert + Delaunay
+regularization_coefficient = 1.0
+
+
+# ---------------------------------------------------------------------------
+# Profiling helpers
+# ---------------------------------------------------------------------------
+
+class Timer:
+    """Collects ``(label, seconds)`` wall-clock records, echoing each on completion."""
+
+    def __init__(self):
+        self.records: list[tuple[str, float]] = []  # in completion order
+
+    @contextmanager
+    def section(self, label: str):
+        began = time.perf_counter()
+        yield  # the ``with`` body runs here; if it raises, nothing is recorded
+        elapsed = time.perf_counter() - began
+        self.records += [(label, elapsed)]
+        print(f"  [{label}] {elapsed:.4f} s")
+
+
+def block(x):
+    """Wait on *x* if it exposes ``block_until_ready``; always return *x* itself."""
+    # Objects without the method fall through to a no-op callable.
+    getattr(x, "block_until_ready", lambda: None)()
+    return x
+
+
+def jit_profile(func, label, *args, n_repeats=10):
+    """AOT-compile *func* and time each phase through the module-level ``timer``.
+
+    Returns ``(compiled, result)`` where ``result`` is the last call's output.
+    """
+    jitted = jax.jit(func)
+
+    with timer.section(f"{label}_lower"):
+        lowered = jitted.lower(*args)
+    with timer.section(f"{label}_compile"):
+        compiled = lowered.compile()
+    with timer.section(f"{label}_first_call"):
+        result = block(compiled(*args))
+    with timer.section(f"{label}_steady_x{n_repeats}"):
+        for _ in range(n_repeats):
+            result = block(compiled(*args))
+
+    # The steady-state section is the newest record: (label, total seconds).
+    _, steady_total = timer.records[-1]
+    per_call = steady_total / n_repeats
+    print(f"    -> per-call avg: {per_call:.6f} s")
+    return compiled, result
+
+
+timer = Timer()
+
+# ===================================================================
+# PART A — Setup (not JIT-compiled)
+# ===================================================================
+
+# ---------------------------------------------------------------------------
+# 1. Dataset
+# ---------------------------------------------------------------------------
+
+print(f"\n--- Dataset loading [{instrument}] ---")
+
+_script_dir = Path(__file__).resolve().parent
+_workspace_root = _script_dir.parents[1]
+pixel_scale = INSTRUMENTS[instrument]["pixel_scale"]
+real_space_shape = INSTRUMENTS[instrument]["real_space_shape"]
+dataset_path = Path("dataset") / "interferometer" / instrument
+
+auto_simulate_if_missing(
+    dataset_path,
+    dataset_type="interferometer",
+    instrument=instrument,
+    workspace_root=_workspace_root,
+)
+
+mask_radius = INSTRUMENTS[instrument]["mask_radius"]
+
+real_space_mask = al.Mask2D.circular(
+    shape_native=real_space_shape,
+    pixel_scales=pixel_scale,
+    radius=mask_radius,
+)
+
+with timer.section("dataset_load"):
+    dataset = al.Interferometer.from_fits(
+        data_path=dataset_path / "data.fits",
+        noise_map_path=dataset_path / "noise_map.fits",
+        uv_wavelengths_path=dataset_path / "uv_wavelengths.fits",
+        real_space_mask=real_space_mask,
+        transformer_class=al.TransformerDFT,
+        # DFT is intentional even at ALMA-scale visibility counts: the JAX-native
+        # NUFFT is incompatible with apply_sparse_operator (used below), and the
+        # legacy pynufft transformer is not JIT-friendly.
+        raise_error_dft_visibilities_limit=False,
+    )
+
+with timer.section("apply_sparse_operator"):
+    # Precompute the NUFFT precision-matrix preload so per-fit curvature
+    # assembly uses the FFT-based sparse path instead of dense DFT for every
+    # source pixel. Unblocked by PyAutoArray#316 (the Pmax > 1 extent-indexing
+    # fix); on Delaunay this was previously guarded with NotImplementedError.
+    dataset = dataset.apply_sparse_operator(use_jax=True, show_progress=True)
+
+n_visibilities = dataset.uv_wavelengths.shape[0]
+print(f"  Total visibilities: {n_visibilities}")
+
+# ---------------------------------------------------------------------------
+# 2. Adapt image + image mesh (Hilbert)
+# ---------------------------------------------------------------------------
+#
+# ``image_mesh.Hilbert`` adaptively places the source mesh vertices in the
+# image plane based on the lensed-source adapt image — denser where the
+# source lives, sparser elsewhere. Replaces the regular ``image_mesh.Overlay``
+# + circular-edge fallback that preceded this path. ``zeroed_pixels=0``
+# because Hilbert's placement is data-driven (no fixed edge points to mask).
+
+print("\n--- Adapt image (lensed source) ---")
+
+with timer.section("adapt_image_build"):
+    adapt_image = adapt_image_for_dataset(
+        dataset_path=dataset_path, dataset=dataset
+    )
+
+print(f"  adapt_image shape (slim): {adapt_image.shape_slim}")
+
+print("\n--- Image mesh construction (Hilbert) ---")
+
+with timer.section("image_mesh_hilbert"):
+    image_mesh = al.image_mesh.Hilbert(
+        pixels=hilbert_pixels, weight_power=1.0, weight_floor=0.0
+    )
+    image_plane_mesh_grid = image_mesh.image_plane_mesh_grid_from(
+        mask=dataset.real_space_mask, adapt_data=adapt_image
+    )
+
+n_mesh_vertices = image_plane_mesh_grid.shape[0]
+edge_pixels_total = 0
+print(f"  Hilbert pixels: {hilbert_pixels}")
+print(f"  Mesh vertices placed: {n_mesh_vertices}")
+
+# ---------------------------------------------------------------------------
+# 3. Model construction
+# ---------------------------------------------------------------------------
+
+print("\n--- Model construction ---")
+
+with timer.section("model_build"):
+    # GaussianPrior(mean=truth, sigma=small) centres prior-median at the
+    # simulator truth while keeping params free so gradient diagnostics
+    # have dimensionality.
+    mass = af.Model(al.mp.Isothermal)
+    mass.centre.centre_0 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.centre.centre_1 = af.GaussianPrior(mean=0.0, sigma=0.005)
+    mass.einstein_radius = af.GaussianPrior(mean=1.6, sigma=0.05)
+    _lens_mass_ell = al.convert.ell_comps_from(axis_ratio=0.9, angle=45.0)
+    mass.ell_comps.ell_comps_0 = af.GaussianPrior(mean=_lens_mass_ell[0], sigma=0.01)
+    mass.ell_comps.ell_comps_1 = af.GaussianPrior(mean=_lens_mass_ell[1], sigma=0.01)
+
+    shear = af.Model(al.mp.ExternalShear)
+    shear.gamma_1 = af.GaussianPrior(mean=0.05, sigma=0.005)
+    shear.gamma_2 = af.GaussianPrior(mean=0.05, sigma=0.005)
+
+    lens = af.Model(al.Galaxy, redshift=0.5, mass=mass, shear=shear)
+
+    mesh = al.mesh.Delaunay(
+        pixels=n_mesh_vertices,
+        zeroed_pixels=0,
+    )
+    regularization = al.reg.ConstantSplit(coefficient=regularization_coefficient)
+    pixelization = al.Pixelization(mesh=mesh, regularization=regularization)
+
+    source = af.Model(al.Galaxy, redshift=1.0, pixelization=pixelization)
+
+    model = af.Collection(galaxies=af.Collection(lens=lens, source=source))
+
+print(f"  Total free parameters: {model.total_free_parameters}")
+print(f"  Delaunay pixels: {n_mesh_vertices}")
+print(f"  Zeroed edge pixels: {edge_pixels_total}")
+
+# ---------------------------------------------------------------------------
+# 4. Instantiate concrete objects from prior medians
+# ---------------------------------------------------------------------------
+
+print("\n--- Instantiate concrete model ---")
+
+with timer.section("instance_from_vector"):
+    param_vector = model.physical_values_from_prior_medians
+    instance = model.instance_from_vector(vector=param_vector)
+
+with timer.section("register_pytrees"):
+    _register_model_pytrees(model)
+
+# JIT input: the instance itself, with all parameter leaves promoted to JAX
+# arrays. The eager NumPy instance is retained for the eager FitInterferometer
+# baseline below.
+params_tree = jax.tree_util.tree_map(jnp.asarray, instance)
+
+tracer = al.Tracer(galaxies=list(instance.galaxies))
+
+# AdaptImages tells FitInterferometer / AnalysisInterferometer where the
+# Delaunay mesh vertices live in the image-plane (separate from the source-
+# plane vertices that get computed by ray-tracing).
+adapt_images = al.AdaptImages(
+    galaxy_image_plane_mesh_grid_dict={
+        instance.galaxies.source: image_plane_mesh_grid,
+    },
+    galaxy_name_image_plane_mesh_grid_dict={
+        "('galaxies', 'source')": image_plane_mesh_grid,
+    },
+)
+
+print(f"  Tracer planes: {tracer.total_planes}")
+
+# ---------------------------------------------------------------------------
+# 5. Configuration summary
+# ---------------------------------------------------------------------------
+
+print("\n--- Configuration (determines run time) ---")
+print(f"  Instrument:              {instrument}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Real-space mask radius:  {mask_radius} arcsec")
+print(f"  Real-space grid shape:   {real_space_shape[0]} x {real_space_shape[1]}")
+print(f"  Visibilities:            {n_visibilities}")
+print(f"  Hilbert pixels:          {hilbert_pixels}")
+print(f"  Delaunay vertices:       {n_mesh_vertices}")
+print(f"  Edge zeroed pixels:      {edge_pixels_total}")
+print(f"  Reg. coefficient:        {regularization_coefficient}")
+
+# ---------------------------------------------------------------------------
+# 6. Full-pipeline reference (FitInterferometer) — eager baseline
+# ---------------------------------------------------------------------------
+
+print("\n--- Full FitInterferometer (eager baseline) ---")
+
+with timer.section("fit_interferometer_eager"):
+    fit = al.FitInterferometer(
+        dataset=dataset,
+        tracer=tracer,
+        adapt_images=adapt_images,
+        settings=al.Settings(use_mixed_precision=_cli.use_mixed_precision),
+        xp=np,
+    )
+    figure_of_merit_ref = fit.figure_of_merit
+    log_likelihood_ref = fit.log_likelihood
+
+print(f"  figure_of_merit = {figure_of_merit_ref}")
+print(f"  log_likelihood  = {log_likelihood_ref}")
+
+
+# ===================================================================
+# PART C — Full-pipeline JIT (for comparison)
+# ===================================================================
+
+print("\n" + "=" * 70)
+print("FULL-PIPELINE JIT")
+print("=" * 70)
+
+analysis = al.AnalysisInterferometer(
+    dataset=dataset,
+    adapt_images=adapt_images,
+    settings=al.Settings(use_mixed_precision=_cli.use_mixed_precision),
+    use_jax=True,
+)
+
+def full_pipeline_from_params(params_tree):
+    """Evaluate the interferometer likelihood on a pytree-shaped ``ModelInstance``.
+
+    The instance crosses the JIT boundary unchanged (no flat-vector unpacking);
+    ``autofit.jax.register_model`` is what makes it a valid pytree.
+    """
+    log_evidence = analysis.log_likelihood_function(instance=params_tree)
+    return log_evidence
+
+_, full_result = jit_profile(full_pipeline_from_params, "full_pipeline", params_tree)
+full_pipeline_per_call = timer.records[-1][1] / 10
+
+print(f"  full log_evidence = {full_result}")
+
+# Correctness: for inversion models (pixelization + regularization), the
+# analysis "log_likelihood_function" actually returns the log-evidence
+# (= figure_of_merit), which includes the regularization/determinant terms.
+# Match against figure_of_merit_ref, not log_likelihood_ref.
+np.testing.assert_allclose(
+    float(full_result),
+    float(figure_of_merit_ref),
+    rtol=1e-4,
+    err_msg="interferometer/delaunay: JIT log-evidence does not match eager figure_of_merit",
+)
+print("  Eager-vs-JIT correctness PASSED")
+
+# ===================================================================
+# PART D — vmap (opt-in) + correctness
+# ===================================================================
+#
+# Delaunay vmap compilation can take 20+ minutes on CPU due to the size of
+# the triangulation + interpolation XLA graph. Skipped by default — set
+# DELAUNAY_VMAP=1 to opt in.
+
+print("\n--- vmap batched evaluation ---")
+
+run_vmap = os.environ.get("DELAUNAY_VMAP", "0") == "1"
+
+batch_size = 3
+vmap_batch_time = None
+vmap_per_call = None
+vmap_speedup = None
+result_vmap = None
+vmapped_full = None
+parameters = None
+
+_n_leaves = len(jax.tree_util.tree_leaves(params_tree))
+if not run_vmap:
+    print("  SKIPPED: opt-in via DELAUNAY_VMAP=1 (compilation can take 20+ minutes).")
+elif _n_leaves == 0:
+    print(f"  SKIPPED: model has 0 free parameters (all fixed to truth); "
+          f"vmap requires at least one array leaf.")
+else:
+    parameters = jax.tree_util.tree_map(
+        lambda leaf: jnp.broadcast_to(leaf, (batch_size, *leaf.shape)),
+        params_tree,
+    )
+
+    vmapped_full = jax.jit(jax.vmap(full_pipeline_from_params))
+
+    with timer.section("vmap_first_call"):
+        result_vmap = vmapped_full(parameters)
+        block(result_vmap)
+
+    n_vmap_repeats = 10
+    with timer.section(f"vmap_steady_x{n_vmap_repeats}"):
+        for _ in range(n_vmap_repeats):
+            result_vmap = vmapped_full(parameters)
+            block(result_vmap)
+
+    vmap_batch_time = timer.records[-1][1] / n_vmap_repeats
+    vmap_per_call = vmap_batch_time / batch_size
+    vmap_speedup = full_pipeline_per_call / vmap_per_call
+
+    print(f"  batch results = {result_vmap}")
+    print(f"  vmap batch of {batch_size}:   {vmap_batch_time:.6f} s")
+    print(f"  vmap per call:         {vmap_per_call:.6f} s")
+    print(f"  single JIT per call:   {full_pipeline_per_call:.6f} s")
+    print(f"  vmap speedup:          {vmap_speedup:.1f}x faster per likelihood")
+
+    np.testing.assert_allclose(
+        np.array(result_vmap),
+        float(full_result),
+        rtol=1e-4,
+        err_msg="interferometer/delaunay: JAX vmap likelihood mismatch",
+    )
+    print("  vmap-vs-single-JIT correctness PASSED")
+
+# ===================================================================
+# PART E — Static memory analysis (only if vmap ran)
+# ===================================================================
+
+print("\n--- Static memory analysis ---")
+
+if vmapped_full is None:
+    print("  SKIPPED: vmap path was not exercised this run.")
+    memory_analysis = None
+else:
+    lowered_batched = vmapped_full.lower(parameters)
+    compiled_batched = lowered_batched.compile()
+
+    memory_analysis = compiled_batched.memory_analysis()
+    print(f"  Output size:  {memory_analysis.output_size_in_bytes / 1024**2:.3f} MB")
+    print(f"  Temp size:    {memory_analysis.temp_size_in_bytes / 1024**2:.3f} MB")
+    print(
+        f"  Total:        "
+        f"{(memory_analysis.output_size_in_bytes + memory_analysis.temp_size_in_bytes) / 1024**2:.3f} MB"
+    )
+
+# ===================================================================
+# JAX Likelihood Function Summary + artefacts
+# ===================================================================
+
+import json
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+al_version = al.__version__
+
+print("\n" + "=" * 70)
+print(f"JAX LIKELIHOOD FUNCTION SUMMARY — {instrument.upper()} — v{al_version}")
+print("=" * 70)
+print(f"  Instrument:              {instrument}")
+print(f"  Pixel scale:             {pixel_scale} arcsec/pixel")
+print(f"  Real-space mask radius:  {mask_radius} arcsec")
+print(f"  Real-space grid shape:   {real_space_shape[0]} x {real_space_shape[1]}")
+print(f"  Visibilities:            {n_visibilities}")
+print(f"  Delaunay vertices:       {n_mesh_vertices}")
+print(f"  Edge zeroed pixels:      {edge_pixels_total}")
+print("-" * 70)
+print(f"  Eager log_likelihood:    {log_likelihood_ref}")
+print(f"  Eager figure_of_merit:   {figure_of_merit_ref}  (log-evidence)")
+print(f"  JIT  log-evidence:       {float(full_result)}")
+print("-" * 70)
+# Timing rows. The runtime variant prints no per-step table, so one rule (not
+# two back-to-back) separates the log-evidence rows from the timings.
+print(f"      {'Full pipeline (single JIT)':<30}  {full_pipeline_per_call:>12.6f} s")
+if vmap_per_call is not None:
+    print(f"      {'vmap batch (per call)':<30}  {vmap_per_call:>12.6f} s")
+    print(f"      {'vmap speedup vs single JIT':<30}  {vmap_speedup:>11.1f}x")
+else:
+    print(f"      {'vmap':<30}  {'SKIPPED':>12}")
+print("=" * 70)
+
+# --- Save results dictionary ---
+
+if vmap_per_call is None:
+    if not run_vmap:
+        vmap_payload = "SKIPPED — opt-in via DELAUNAY_VMAP=1"
+    else:
+        vmap_payload = "SKIPPED — model has 0 free parameters (all fixed to truth)"
+else:
+    vmap_payload = {
+        "batch_size": batch_size,
+        "batch_time": vmap_batch_time,
+        "per_call": vmap_per_call,
+        "speedup_vs_single_jit": round(vmap_speedup, 1),
+    }
+
+likelihood_summary = {
+    "autolens_version": al_version,
+    "device": device_info_dict(),
+    "instrument": instrument,
+    "model": "delaunay",
+    "configuration": {
+        "pixel_scale_arcsec": pixel_scale,
+        "mask_radius_arcsec": mask_radius,
+        "real_space_shape": list(real_space_shape),
+        "visibilities": int(n_visibilities),
+        "hilbert_pixels": int(hilbert_pixels),
+        "delaunay_vertices": int(n_mesh_vertices),
+        "edge_zeroed_pixels": int(edge_pixels_total),
+        "regularization_coefficient": regularization_coefficient,
+    },
+    "log_likelihood_eager": float(log_likelihood_ref),
+    "figure_of_merit_eager": float(figure_of_merit_ref),
+    "log_evidence_jit": float(full_result),
+    "full_pipeline_single_jit": full_pipeline_per_call,
+    "vmap": vmap_payload,
+    "memory_mb": None if memory_analysis is None else {
+        "output": memory_analysis.output_size_in_bytes / 1024**2,
+        "temp": memory_analysis.temp_size_in_bytes / 1024**2,
+    },
+}
+
+dict_path, chart_path = resolve_output_paths(
+    _cli,
+    default_dir=_workspace_root / "results" / "likelihood" / "interferometer",
+    default_basename=f"delaunay_likelihood_summary_{instrument}_v{al_version}",
+)
+dict_path.write_text(json.dumps(likelihood_summary, indent=2))
+print(f"\n  Results dict saved to: {dict_path}")
+print(f"  Bar chart path:        {chart_path} (no per-step chart in runtime variant)")
+
+
+# ===================================================================
+# Regression assertion — realistic-scale deterministic log-evidence
+# ===================================================================
+#
+# Simulator truth parameters via GaussianPrior(mean=truth, sigma=small)
+# make the full-pipeline log-evidence deterministic at the prior median.
+# Pinned empirically per instrument; ``None`` means "skip the assertion and
+# print the value so it can be pasted in here on a clean run". sma was
+# bumped to mask_radius=3.5 in 2026-05-21's INSTRUMENTS refactor — the
+# old mask_radius=3.0 value no longer applies and needs re-measuring.
+EXPECTED_LOG_EVIDENCE = {
+    "sma": None,
+    "alma": None,
+    "alma_high": None,
+}
+
+expected_log_evidence = EXPECTED_LOG_EVIDENCE.get(instrument)
+
+if expected_log_evidence is None:
+    print(
+        f"\n  Regression assertion SKIPPED for [{instrument}] — "
+        f"capture this run's eager log_evidence ({figure_of_merit_ref}) "
+        f"and paste it into EXPECTED_LOG_EVIDENCE[{instrument!r}]."
+    )
+else:
+    np.testing.assert_allclose(
+        figure_of_merit_ref,
+        expected_log_evidence,
+        rtol=1e-4,
+        err_msg=(
+            f"interferometer/delaunay[{instrument}]: regression — eager log_evidence "
+            f"drifted (got {figure_of_merit_ref}, expected {expected_log_evidence})"
+        ),
+    )
+    print(
+        f"  Eager regression assertion PASSED: log_evidence matches "
+        f"{expected_log_evidence:.6f}"
+    )
+    np.testing.assert_allclose(
+        float(full_result),
+        expected_log_evidence,
+        rtol=1e-3,
+        err_msg=f"interferometer/delaunay[{instrument}]: regression — full log_evidence drifted",
+    )
+    print(f"  Full-pipeline regression assertion PASSED")
+    if result_vmap is not None:
+        np.testing.assert_allclose(
+            np.array(result_vmap),
+            expected_log_evidence,
+            rtol=1e-3,
+            err_msg=f"interferometer/delaunay[{instrument}]: regression — vmap log_evidence drifted",
+        )
+        print(f"  vmap regression assertion PASSED")
diff --git a/likelihood/interferometer/mge.py b/likelihood_runtime/interferometer/mge.py
similarity index 99%
rename from likelihood/interferometer/mge.py
rename to likelihood_runtime/interferometer/mge.py
index 3ddd706..cb931e0 100644
--- a/likelihood/interferometer/mge.py
+++ b/likelihood_runtime/interferometer/mge.py
@@ -78,7 +78,6 @@
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Plus this script's own --use-dft override to compare NUFFT against the
 # historical DFT baseline on SMA.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
diff --git a/likelihood/interferometer/pixelization.py b/likelihood_runtime/interferometer/pixelization.py
similarity index 99%
rename from likelihood/interferometer/pixelization.py
rename to likelihood_runtime/interferometer/pixelization.py
index b488be4..c1bb6e2 100644
--- a/likelihood/interferometer/pixelization.py
+++ b/likelihood_runtime/interferometer/pixelization.py
@@ -57,8 +57,8 @@
 import autoarray as aa
 from autofit.jax import register_model as _register_model_pytrees
 
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-from adapt_image_util import adapt_image_for_dataset  # noqa: E402
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+from _adapt_image_util import adapt_image_for_dataset  # noqa: E402
 
 # ---------------------------------------------------------------------------
 # Instrument configuration
@@ -76,8 +76,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
-sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
     device_info_dict,
diff --git a/likelihood/point_source/image_plane.py b/likelihood_runtime/point_source/image_plane.py
similarity index 99%
rename from likelihood/point_source/image_plane.py
rename to likelihood_runtime/point_source/image_plane.py
index 373b4b4..7955387 100644
--- a/likelihood/point_source/image_plane.py
+++ b/likelihood_runtime/point_source/image_plane.py
@@ -61,7 +61,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
diff --git a/likelihood/point_source/source_plane.py b/likelihood_runtime/point_source/source_plane.py
similarity index 99%
rename from likelihood/point_source/source_plane.py
rename to likelihood_runtime/point_source/source_plane.py
index 2820ad0..ead517e 100644
--- a/likelihood/point_source/source_plane.py
+++ b/likelihood_runtime/point_source/source_plane.py
@@ -43,7 +43,6 @@
 
 # Sweep-driver CLI args (--config-name / --output-dir / --use-mixed-precision).
 # Tolerates extra/unknown args via parse_known_args inside the helper.
-sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
 from _profile_cli import (  # noqa: E402
     parse_profile_cli,
diff --git a/scripts/sweep_likelihood.py b/likelihood_runtime/sweep.py
similarity index 94%
rename from scripts/sweep_likelihood.py
rename to likelihood_runtime/sweep.py
index 8f3ecdb..1d23edc 100644
--- a/scripts/sweep_likelihood.py
+++ b/likelihood_runtime/sweep.py
@@ -5,9 +5,9 @@
 `z_projects/profiling/hpc/sync`).
 
 Each subprocess invokes the existing per-cell likelihood script under
-``autolens_profiling/likelihood/<class>/<model>.py`` with the new CLI args
-the scripts gained in Phase 1 (``--config-name``, ``--output-dir``,
-``--use-mixed-precision``). Per-config JSONs land at::
+``autolens_profiling/likelihood_runtime/<class>/<model>.py`` with the
+CLI args ``--config-name``, ``--output-dir``, ``--use-mixed-precision``.
+Per-config JSONs land at::
 
     <output_root>/<class>/<model>/<config_name>.json
     <output_root>/<class>/<model>/<config_name>.png
@@ -15,19 +15,19 @@
 
 Default ``--output-root`` is
 ``autolens_workspace_developer/jax_profiling/results/jit`` — matches the
-existing imaging precedent and is read by ``aggregate_sweep.py`` to produce
+existing imaging precedent and is read by ``aggregate.py`` to produce
 ``comparison.json`` / ``comparison.png``.
 
 Usage::
 
     # All in-scope cells, both backends
-    python scripts/sweep_likelihood.py
+    python likelihood_runtime/sweep.py
 
     # Skip the heaviest cell during iteration
-    python scripts/sweep_likelihood.py --skip datacube/delaunay
+    python likelihood_runtime/sweep.py --skip datacube/delaunay
 
     # Single cell, single backend
-    python scripts/sweep_likelihood.py --only interferometer/mge --skip-cpu
+    python likelihood_runtime/sweep.py --only interferometer/mge --skip-cpu
 """
 
 from __future__ import annotations
@@ -251,7 +251,7 @@ def main() -> int:
     overall_t0 = time.time()
 
     for (cls, model) in cells:
-        script_path = _REPO_ROOT / "likelihood" / cls / f"{model}.py"
+        script_path = _REPO_ROOT / "likelihood_runtime" / cls / f"{model}.py"
         if not script_path.exists():
             print(f"\n!!! missing script: {script_path}")
             for cfg in configs: