Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add plot_ppc_pit #159

Merged
merged 2 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion References.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,12 @@ See https://github.com/arviz-devs/arviz-stats/issues/56
References
----------
.. [1] Kleiber C, Zeileis A. *Visualizing Count Data Regressions Using Rootograms*.
The American Statistician, 70(3). (2016) https://doi.org/10.1080/00031305.2016.1173590
The American Statistician, 70(3). (2016) https://doi.org/10.1080/00031305.2016.1173590

## ECDF-pit

References
----------
.. [1] Säilynoja T, Bürkner PC. and Vehtari A. *Graphical test for discrete uniformity and
its applications in goodness-of-fit evaluation and multiple sample comparison*.
Statistics and Computing 32(32). (2022) https://doi.org/10.1007/s11222-022-10090-6
1 change: 1 addition & 0 deletions docs/source/api/plots.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ A complementary introduction and guide to ``plot_...`` functions is available at
plot_forest
plot_ppc_dist
plot_ppc_pava
plot_ppc_pit
plot_ppc_rootogram
plot_psense_dist
plot_psense_quantities
Expand Down
26 changes: 26 additions & 0 deletions docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
# Predictive check with pit ECDF

Plot of $p(\tilde y_i \le y_i \mid y)$, where $y_i$ represents the observed data for index $i$
and $\tilde y_i$ represents the posterior predictive data for index $i$.

The distribution should be uniform if the model is well-calibrated. As small deviations from
uniformity are expected, the plot also shows the credible bands.
---

:::{seealso}
API Documentation: {func}`~arviz_plots.plot_ppc_pit`
:::
"""
from arviz_base import load_arviz_data

import arviz_plots as azp

azp.style.use("arviz-variat")

dt = load_arviz_data("radon")
pc = azp.plot_ppc_pit(
dt,
backend="none",
)
pc.show()
2 changes: 2 additions & 0 deletions src/arviz_plots/plots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .forestplot import plot_forest
from .pavacalibrationplot import plot_ppc_pava
from .ppcdistplot import plot_ppc_dist
from .ppcpitplot import plot_ppc_pit
from .ppcrootogramplot import plot_ppc_rootogram
from .psensedistplot import plot_psense_dist
from .psensequantitiesplot import plot_psense_quantities
Expand All @@ -30,6 +31,7 @@
"plot_ppc_rootogram",
"plot_ridge",
"plot_ppc_pava",
"plot_ppc_pit",
"plot_psense_dist",
"plot_psense_quantities",
]
281 changes: 281 additions & 0 deletions src/arviz_plots/plots/ppcpitplot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
"""Plot ppc pit for continuous data."""
import warnings
from copy import copy
from importlib import import_module

from arviz_base import rcParams
from arviz_base.labels import BaseLabeller
from arviz_stats.ecdf_utils import difference_ecdf_pit

from arviz_plots.plot_collection import PlotCollection, process_facet_dims
from arviz_plots.plots.utils import filter_aes
from arviz_plots.visuals import ecdf_line, fill_between_y, labelled_title, labelled_x, labelled_y


def plot_ppc_pit(
dt,
ci_prob=0.95, # Not sure we need to use rcParams here of if we should just use 0.95
method="simulation",
n_simulations=1000,
loo=False,
var_names=None,
data_pairs=None,
filter_vars=None, # pylint: disable=unused-argument
coords=None, # pylint: disable=unused-argument
sample_dims=None,
plot_collection=None,
backend=None,
labeller=None,
aes_map=None,
plot_kwargs=None,
pc_kwargs=None,
):
"""Marginal_p_values with simultaneous confidence envelope.

For a calibrated model, the posterior predictive p-values should be uniformly distributed.
This plot shows the empirical cumulative distribution function (ECDF) of the p-values.
To make the plot easier to interpret, we plot the Δ-ECDF, that is, the difference between
the expected CDF from the observed ECDF. Simultaneous confidence bands are computed using
the method described in described in [1]_.

Parameters
----------
dt : DataTree
Input data
ci_prob : float, optional
Probability for the credible interval. Defaults to 0.95.
n_simulations : int, optional
Number of simulations to use to compute simultaneous confidence intervals when using the
`method="simulation"` ignored if method is "optimized". Defaults to 1000.
method : str, optional
Method to compute the confidence intervals. Either "simulation" or "optimized".
Defaults to "simulation".
loo : bool, optional
If True, use the leave-one-out cross-validation samples. Defaults to False.
Requires the `log_likelihood` group to be present in the DataTree.
data_pairs : dict, optional
Dictionary of keys prior/posterior predictive data and values observed data variable names.
If None, it will assume that the observed data and the predictive data have
the same variable name.
var_names : str or list of str, optional
One or more variables to be plotted. Currently only one variable is supported.
Prefix the variables by ~ when you want to exclude them from the plot.
filter_vars : {None, “like”, “regex”}, optional, default=None
If None (default), interpret var_names as the real variables names.
If “like”, interpret var_names as substrings of the real variables names.
If “regex”, interpret var_names as regular expressions on the real variables names.
coords : dict, optional
Coordinates to plot.
sample_dims : str or sequence of hashable, optional
Dimensions to reduce unless mapped to an aesthetic.
Defaults to ``rcParams["data.sample_dims"]``
plot_collection : PlotCollection, optional
backend : {"matplotlib", "bokeh", "plotly"}, optional
labeller : labeller, optional
aes_map : mapping of {str : sequence of str}, optional
Mapping of artists to aesthetics that should use their mapping in `plot_collection`
when plotted. Valid keys are the same as for `plot_kwargs`.

plot_kwargs : mapping of {str : mapping or False}, optional
Valid keys are:

* ecdf_lines -> passed to :func:`~arviz_plots.visuals.ecdf_line`
* ci -> passed to :func:`~arviz_plots.visuals.ci_line_y`
* xlabel -> passed to :func:`~arviz_plots.visuals.labelled_x`
* ylabel -> passed to :func:`~arviz_plots.visuals.labelled_y`
* title -> passed to :func:`~arviz_plots.visuals.labelled_title`

pc_kwargs : mapping
Passed to :class:`arviz_plots.PlotCollection.grid`

Returns
-------
PlotCollection

Examples
--------
Plot the ecdf-PIT for the radon dataset.

.. plot::
:context: close-figs

>>> from arviz_plots import plot_ppc_pit, style
>>> style.use("arviz-variat")
>>> from arviz_base import load_arviz_data
>>> dt = load_arviz_data('radon')
>>> plot_ppc_pit(dt)


.. minigallery:: plot_ppc_pit

.. [1] Säilynoja T, Bürkner PC. and Vehtari A. *Graphical test for discrete uniformity and
its applications in goodness-of-fit evaluation and multiple sample comparison*.
Statistics and Computing 32(32). (2022) https://doi.org/10.1007/s11222-022-10090-6
"""
if ci_prob is None:
ci_prob = rcParams["stats.ci_prob"]
if sample_dims is None:
sample_dims = rcParams["data.sample_dims"]
if isinstance(sample_dims, str):
sample_dims = [sample_dims]
sample_dims = list(sample_dims)
if plot_kwargs is None:
plot_kwargs = {}
else:
plot_kwargs = plot_kwargs.copy()
if pc_kwargs is None:
pc_kwargs = {}
else:
pc_kwargs = pc_kwargs.copy()

if backend is None:
if plot_collection is None:
backend = rcParams["plot.backend"]
else:
backend = plot_collection.backend

labeller = BaseLabeller()

if data_pairs is None:
data_pairs = {var_names: var_names}
if None in data_pairs.keys():
data_pairs = dict(zip(dt.posterior_predictive.data_vars, dt.observed_data.data_vars))

predictive_types = [
dt.posterior_predictive[var].values.dtype.kind == "i" for var in data_pairs.keys()
]
observed_types = [dt.observed_data[var].values.dtype.kind == "i" for var in data_pairs.values()]

# Currently only continuous data is supported
if any(predictive_types + observed_types):
warnings.warn(
"Detected at least one discrete variable.\n"
"Consider using plot_ppc variants specific for discrete data, "
"such as plot_ppc_pava or plot_ppc_rootogram.",
UserWarning,
stacklevel=2,
)

# We should default to use loo when available
if loo:
pass

ds_ecdf = difference_ecdf_pit(dt, data_pairs, ci_prob, method, n_simulations)

plot_bknd = import_module(f".backend.{backend}", package="arviz_plots")
colors = plot_bknd.get_default_aes("color", 1, {})

if plot_collection is None:
pc_kwargs["plot_grid_kws"] = pc_kwargs.get("plot_grid_kws", {}).copy()

pc_kwargs["aes"] = pc_kwargs.get("aes", {}).copy()
pc_kwargs.setdefault("rows", None)
pc_kwargs.setdefault("cols", ["__variable__"])

figsize = pc_kwargs["plot_grid_kws"].get("figsize", None)
figsize_units = pc_kwargs["plot_grid_kws"].get("figsize_units", "inches")
col_dims = pc_kwargs["cols"]
row_dims = pc_kwargs["rows"]
if figsize is None:
figsize = plot_bknd.scale_fig_size(
figsize,
rows=process_facet_dims(ds_ecdf, row_dims)[0],
cols=process_facet_dims(ds_ecdf, col_dims)[0],
figsize_units=figsize_units,
)
figsize_units = "dots"
pc_kwargs["plot_grid_kws"]["figsize"] = figsize
pc_kwargs["plot_grid_kws"]["figsize_units"] = figsize_units

plot_collection = PlotCollection.grid(
ds_ecdf,
backend=backend,
**pc_kwargs,
)

if aes_map is None:
aes_map = {}
else:
aes_map = aes_map.copy()

## ecdf_line
ecdf_ls_kwargs = copy(plot_kwargs.get("ecdf_lines", {}))

if ecdf_ls_kwargs is not False:
_, _, ecdf_ls_ignore = filter_aes(plot_collection, aes_map, "ecdf_lines", sample_dims)
ecdf_ls_kwargs.setdefault("color", colors[0])

plot_collection.map(
ecdf_line,
"ecdf_lines",
data=ds_ecdf,
ignore_aes=ecdf_ls_ignore,
**ecdf_ls_kwargs,
)

ci_kwargs = copy(plot_kwargs.get("ci", {}))
_, _, ci_ignore = filter_aes(plot_collection, aes_map, "ci", sample_dims)
if ci_kwargs is not False:
ci_kwargs.setdefault("color", colors[0])
ci_kwargs.setdefault("alpha", 0.2)

plot_collection.map(
fill_between_y,
"ci",
data=ds_ecdf,
x=ds_ecdf.sel(plot_axis="x"),
y_bottom=ds_ecdf.sel(plot_axis="y_bottom"),
y_top=ds_ecdf.sel(plot_axis="y_top"),
ignore_aes=ci_ignore,
**ci_kwargs,
)

# set xlabel
_, xlabels_aes, xlabels_ignore = filter_aes(plot_collection, aes_map, "xlabel", sample_dims)
xlabel_kwargs = copy(plot_kwargs.get("xlabel", {}))
if xlabel_kwargs is not False:
if "color" not in xlabels_aes:
xlabel_kwargs.setdefault("color", "black")

xlabel_kwargs.setdefault("text", "quantiles")

plot_collection.map(
labelled_x,
"xlabel",
ignore_aes=xlabels_ignore,
subset_info=True,
**xlabel_kwargs,
)

# set ylabel
_, ylabels_aes, ylabels_ignore = filter_aes(plot_collection, aes_map, "ylabel", sample_dims)
ylabel_kwargs = copy(plot_kwargs.get("ylabel", {}))
if ylabel_kwargs is not False:
if "color" not in ylabels_aes:
ylabel_kwargs.setdefault("color", "black")

ylabel_kwargs.setdefault("text", "Δ ECDF")

plot_collection.map(
labelled_y,
"ylabel",
ignore_aes=ylabels_ignore,
subset_info=True,
**ylabel_kwargs,
)

# title
title_kwargs = copy(plot_kwargs.get("title", {}))
_, _, title_ignore = filter_aes(plot_collection, aes_map, "title", sample_dims)

if title_kwargs is not False:
plot_collection.map(
labelled_title,
"title",
ignore_aes=title_ignore,
subset_info=True,
labeller=labeller,
**title_kwargs,
)

return plot_collection