From cabf9e9ef6997e4912895fda45e9150bd8dd589b Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Tue, 4 Mar 2025 13:53:08 +0200 Subject: [PATCH 1/2] add plot_ppc_pit --- References.md | 10 +- docs/source/api/plots.rst | 1 + .../plot_ppc_pit.py | 25 ++ src/arviz_plots/plots/__init__.py | 2 + src/arviz_plots/plots/ppcpitplot.py | 281 ++++++++++++++++++ 5 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py create mode 100644 src/arviz_plots/plots/ppcpitplot.py diff --git a/References.md b/References.md index c8e6aa3b..6eab646a 100644 --- a/References.md +++ b/References.md @@ -56,4 +56,12 @@ See https://github.com/arviz-devs/arviz-stats/issues/56 References ---------- .. [1] Kleiber C, Zeileis A. *Visualizing Count Data Regressions Using Rootograms*. - The American Statistician, 70(3). (2016) https://doi.org/10.1080/00031305.2016.1173590 \ No newline at end of file + The American Statistician, 70(3). (2016) https://doi.org/10.1080/00031305.2016.1173590 + +## ECDF-pit + + References + ---------- + .. [1] Säilynoja T, Bürkner PC. and Vehtari A. *Graphical test for discrete uniformity and + its applications in goodness-of-fit evaluation and multiple sample comparison*. + Statistics and Computing 32(32). (2022) https://doi.org/10.1007/s11222-022-10090-6 \ No newline at end of file diff --git a/docs/source/api/plots.rst b/docs/source/api/plots.rst index a17609e3..a3b3a7bf 100644 --- a/docs/source/api/plots.rst +++ b/docs/source/api/plots.rst @@ -25,6 +25,7 @@ A complementary introduction and guide to ``plot_...`` functions is available at plot_forest plot_ppc_dist plot_ppc_pava + plot_ppc_pit plot_ppc_rootogram plot_psense_dist plot_psense_quantities diff --git a/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py b/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py new file mode 100644 index 00000000..39e33e50 --- /dev/null +++ b/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py @@ -0,0 +1,25 @@ +""" +# Predictive check with pit ECDF + +Plot of $p(\tilde y_i \le y_i \mid y)$, where $y_i$ represents the observed data for index $i$ +and $\tilde y_i$ represents the posterior predictive data for index $i$. + +The distribution should be uniform if the model is well-calibrated. +--- + +:::{seealso} +API Documentation: {func}`~arviz_plots.plot_ppc_pit` +::: +""" +from arviz_base import load_arviz_data + +import arviz_plots as azp + +azp.style.use("arviz-variat") + +dt = load_arviz_data("radon") +pc = azp.plot_ppc_pit( + dt, + backend="none", +) +pc.show() diff --git a/src/arviz_plots/plots/__init__.py b/src/arviz_plots/plots/__init__.py index dfda262a..7fc94059 100644 --- a/src/arviz_plots/plots/__init__.py +++ b/src/arviz_plots/plots/__init__.py @@ -9,6 +9,7 @@ from .forestplot import plot_forest from .pavacalibrationplot import plot_ppc_pava from .ppcdistplot import plot_ppc_dist +from .ppcpitplot import plot_ppc_pit from .ppcrootogramplot import plot_ppc_rootogram from .psensedistplot import plot_psense_dist from .psensequantitiesplot import plot_psense_quantities @@ -30,6 +31,7 @@ "plot_ppc_rootogram", "plot_ridge", "plot_ppc_pava", + "plot_ppc_pit", "plot_psense_dist", "plot_psense_quantities", ] diff --git a/src/arviz_plots/plots/ppcpitplot.py b/src/arviz_plots/plots/ppcpitplot.py new file mode 100644 index 00000000..6980e217 --- /dev/null +++ b/src/arviz_plots/plots/ppcpitplot.py @@ -0,0 +1,281 @@ +"""Plot ppc pit for continuous data.""" +import warnings +from copy import copy +from importlib import import_module + +from arviz_base import rcParams +from arviz_base.labels import BaseLabeller +from arviz_stats.ecdf_utils import difference_ecdf_pit + +from arviz_plots.plot_collection import PlotCollection, process_facet_dims +from arviz_plots.plots.utils import filter_aes +from arviz_plots.visuals import ecdf_line, fill_between_y, labelled_title, labelled_x, labelled_y + + +def plot_ppc_pit( + dt, + ci_prob=0.95, # Not sure we need to use rcParams here of if we should just use 0.95 + method="simulation", + n_simulations=1000, + loo=False, + var_names=None, + data_pairs=None, + filter_vars=None, # pylint: disable=unused-argument + coords=None, # pylint: disable=unused-argument + sample_dims=None, + plot_collection=None, + backend=None, + labeller=None, + aes_map=None, + plot_kwargs=None, + pc_kwargs=None, +): + """Marginal_p_values with simultaneous confidence envelope. + + For a calibrated model, the posterior predictive p-values should be uniformly distributed. + This plot shows the empirical cumulative distribution function (ECDF) of the p-values. + To make the plot easier to interpret, we plot the Δ-ECDF, that is, the difference between + the expected CDF from the observed ECDF. Simultaneous confidence bands are computed using + the method described in described in [1]_. + + Parameters + ---------- + dt : DataTree + Input data + ci_prob : float, optional + Probability for the credible interval. Defaults to 0.95. + n_simulations : int, optional + Number of simulations to use to compute simultaneous confidence intervals when using the + `method="simulation"` ignored if method is "optimized". Defaults to 1000. + method : str, optional + Method to compute the confidence intervals. Either "simulation" or "optimized". + Defaults to "simulation". + loo : bool, optional + If True, use the leave-one-out cross-validation samples. Defaults to False. + Requires the `log_likelihood` group to be present in the DataTree. + data_pairs : dict, optional + Dictionary of keys prior/posterior predictive data and values observed data variable names. + If None, it will assume that the observed data and the predictive data have + the same variable name. + var_names : str or list of str, optional + One or more variables to be plotted. Currently only one variable is supported. + Prefix the variables by ~ when you want to exclude them from the plot. + filter_vars : {None, “like”, “regex”}, optional, default=None + If None (default), interpret var_names as the real variables names. + If “like”, interpret var_names as substrings of the real variables names. + If “regex”, interpret var_names as regular expressions on the real variables names. + coords : dict, optional + Coordinates to plot. + sample_dims : str or sequence of hashable, optional + Dimensions to reduce unless mapped to an aesthetic. + Defaults to ``rcParams["data.sample_dims"]`` + plot_collection : PlotCollection, optional + backend : {"matplotlib", "bokeh", "plotly"}, optional + labeller : labeller, optional + aes_map : mapping of {str : sequence of str}, optional + Mapping of artists to aesthetics that should use their mapping in `plot_collection` + when plotted. Valid keys are the same as for `plot_kwargs`. + + plot_kwargs : mapping of {str : mapping or False}, optional + Valid keys are: + + * ecdf_lines -> passed to :func:`~arviz_plots.visuals.ecdf_line` + * ci -> passed to :func:`~arviz_plots.visuals.ci_line_y` + * xlabel -> passed to :func:`~arviz_plots.visuals.labelled_x` + * ylabel -> passed to :func:`~arviz_plots.visuals.labelled_y` + * title -> passed to :func:`~arviz_plots.visuals.labelled_title` + + pc_kwargs : mapping + Passed to :class:`arviz_plots.PlotCollection.grid` + + Returns + ------- + PlotCollection + + Examples + -------- + Plot the ecdf-PIT for the radon dataset. + + .. plot:: + :context: close-figs + + >>> from arviz_plots import plot_ppc_pit, style + >>> style.use("arviz-variat") + >>> from arviz_base import load_arviz_data + >>> dt = load_arviz_data('radon') + >>> plot_ppc_pit(dt) + + + .. minigallery:: plot_ppc_pit + + .. [1] Säilynoja T, Bürkner PC. and Vehtari A. *Graphical test for discrete uniformity and + its applications in goodness-of-fit evaluation and multiple sample comparison*. + Statistics and Computing 32(32). (2022) https://doi.org/10.1007/s11222-022-10090-6 + """ + if ci_prob is None: + ci_prob = rcParams["stats.ci_prob"] + if sample_dims is None: + sample_dims = rcParams["data.sample_dims"] + if isinstance(sample_dims, str): + sample_dims = [sample_dims] + sample_dims = list(sample_dims) + if plot_kwargs is None: + plot_kwargs = {} + else: + plot_kwargs = plot_kwargs.copy() + if pc_kwargs is None: + pc_kwargs = {} + else: + pc_kwargs = pc_kwargs.copy() + + if backend is None: + if plot_collection is None: + backend = rcParams["plot.backend"] + else: + backend = plot_collection.backend + + labeller = BaseLabeller() + + if data_pairs is None: + data_pairs = {var_names: var_names} + if None in data_pairs.keys(): + data_pairs = dict(zip(dt.posterior_predictive.data_vars, dt.observed_data.data_vars)) + + predictive_types = [ + dt.posterior_predictive[var].values.dtype.kind == "i" for var in data_pairs.keys() + ] + observed_types = [dt.observed_data[var].values.dtype.kind == "i" for var in data_pairs.values()] + + # Currently only continuous data is supported + if any(predictive_types + observed_types): + warnings.warn( + "Detected at least one discrete variable.\n" + "Consider using plot_ppc variants specific for discrete data, " + "such as plot_ppc_pava or plot_ppc_rootogram.", + UserWarning, + stacklevel=2, + ) + + # We should default to use loo when available + if loo: + pass + + ds_ecdf = difference_ecdf_pit(dt, data_pairs, ci_prob, method, n_simulations) + + plot_bknd = import_module(f".backend.{backend}", package="arviz_plots") + colors = plot_bknd.get_default_aes("color", 1, {}) + + if plot_collection is None: + pc_kwargs["plot_grid_kws"] = pc_kwargs.get("plot_grid_kws", {}).copy() + + pc_kwargs["aes"] = pc_kwargs.get("aes", {}).copy() + pc_kwargs.setdefault("rows", None) + pc_kwargs.setdefault("cols", ["__variable__"]) + + figsize = pc_kwargs["plot_grid_kws"].get("figsize", None) + figsize_units = pc_kwargs["plot_grid_kws"].get("figsize_units", "inches") + col_dims = pc_kwargs["cols"] + row_dims = pc_kwargs["rows"] + if figsize is None: + figsize = plot_bknd.scale_fig_size( + figsize, + rows=process_facet_dims(ds_ecdf, row_dims)[0], + cols=process_facet_dims(ds_ecdf, col_dims)[0], + figsize_units=figsize_units, + ) + figsize_units = "dots" + pc_kwargs["plot_grid_kws"]["figsize"] = figsize + pc_kwargs["plot_grid_kws"]["figsize_units"] = figsize_units + + plot_collection = PlotCollection.grid( + ds_ecdf, + backend=backend, + **pc_kwargs, + ) + + if aes_map is None: + aes_map = {} + else: + aes_map = aes_map.copy() + + ## ecdf_line + ecdf_ls_kwargs = copy(plot_kwargs.get("ecdf_lines", {})) + + if ecdf_ls_kwargs is not False: + _, _, ecdf_ls_ignore = filter_aes(plot_collection, aes_map, "ecdf_lines", sample_dims) + ecdf_ls_kwargs.setdefault("color", colors[0]) + + plot_collection.map( + ecdf_line, + "ecdf_lines", + data=ds_ecdf, + ignore_aes=ecdf_ls_ignore, + **ecdf_ls_kwargs, + ) + + ci_kwargs = copy(plot_kwargs.get("ci", {})) + _, _, ci_ignore = filter_aes(plot_collection, aes_map, "ci", sample_dims) + if ci_kwargs is not False: + ci_kwargs.setdefault("color", colors[0]) + ci_kwargs.setdefault("alpha", 0.2) + + plot_collection.map( + fill_between_y, + "ci", + data=ds_ecdf, + x=ds_ecdf.sel(plot_axis="x"), + y_bottom=ds_ecdf.sel(plot_axis="y_bottom"), + y_top=ds_ecdf.sel(plot_axis="y_top"), + ignore_aes=ci_ignore, + **ci_kwargs, + ) + + # set xlabel + _, xlabels_aes, xlabels_ignore = filter_aes(plot_collection, aes_map, "xlabel", sample_dims) + xlabel_kwargs = copy(plot_kwargs.get("xlabel", {})) + if xlabel_kwargs is not False: + if "color" not in xlabels_aes: + xlabel_kwargs.setdefault("color", "black") + + xlabel_kwargs.setdefault("text", "quantiles") + + plot_collection.map( + labelled_x, + "xlabel", + ignore_aes=xlabels_ignore, + subset_info=True, + **xlabel_kwargs, + ) + + # set ylabel + _, ylabels_aes, ylabels_ignore = filter_aes(plot_collection, aes_map, "ylabel", sample_dims) + ylabel_kwargs = copy(plot_kwargs.get("ylabel", {})) + if ylabel_kwargs is not False: + if "color" not in ylabels_aes: + ylabel_kwargs.setdefault("color", "black") + + ylabel_kwargs.setdefault("text", "Δ ECDF") + + plot_collection.map( + labelled_y, + "ylabel", + ignore_aes=ylabels_ignore, + subset_info=True, + **ylabel_kwargs, + ) + + # title + title_kwargs = copy(plot_kwargs.get("title", {})) + _, _, title_ignore = filter_aes(plot_collection, aes_map, "title", sample_dims) + + if title_kwargs is not False: + plot_collection.map( + labelled_title, + "title", + ignore_aes=title_ignore, + subset_info=True, + labeller=labeller, + **title_kwargs, + ) + + return plot_collection From d24239c489a7dfe126bb7d047707d8feb6f02dfd Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Tue, 4 Mar 2025 14:11:20 +0200 Subject: [PATCH 2/2] clarify text example --- .../source/gallery/posterior_predictive_checks/plot_ppc_pit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py b/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py index 39e33e50..a7bac4b3 100644 --- a/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py +++ b/docs/source/gallery/posterior_predictive_checks/plot_ppc_pit.py @@ -4,7 +4,8 @@ Plot of $p(\tilde y_i \le y_i \mid y)$, where $y_i$ represents the observed data for index $i$ and $\tilde y_i$ represents the posterior predictive data for index $i$. -The distribution should be uniform if the model is well-calibrated. +The distribution should be uniform if the model is well-calibrated. As small deviations from +uniformity are expected, the plot also shows the credible bands. --- :::{seealso}