misc documentation fixes (#417)
* add conditional tests to API reference

* fix doc string issues

* render math properly in conditional docstrings

* change ljungbox reference

* try using automodule

* [skip ci] fix api

* [skip ci] fix for reference

* revert old version of doing things

* add notes for preprints

* add tutorials for CDcorr

* [skip ci] fix documentation imports for CDcorr and PDcorr

* add conditional simulations to documentation
sampan501 authored Sep 12, 2024
1 parent 8ec3610 commit be77507
Showing 25 changed files with 227 additions and 212 deletions.
132 changes: 0 additions & 132 deletions benchmarks/condi_indep_power_sampsize.py

This file was deleted.

4 changes: 2 additions & 2 deletions benchmarks/indep_power_dimension.py
@@ -132,7 +132,7 @@ def estimate_power(sim, test, auto=False):
#
# outputs = Parallel(n_jobs=-1, verbose=100)(
#     [
-#         delayed(estimate_featimport)(sim_name, test)
+#         delayed(estimate_power)(sim_name, test)
#         for sim_name in SIMULATIONS.keys()
#         for test in INDEP_TESTS.keys()
#     ]
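For reference, the corrected driver in runnable form. This is a minimal sketch, not the benchmark itself: SIMULATIONS, INDEP_TESTS, and the estimate_power body are hypothetical stand-ins, and only the joblib dispatch pattern with the fixed estimate_power call is the point.

from joblib import Parallel, delayed

# Hypothetical stand-ins for the benchmark's real grids and power routine.
SIMULATIONS = {"linear": None, "spiral": None}
INDEP_TESTS = {"Dcorr": None, "Hsic": None}

def estimate_power(sim_name, test):
    # Placeholder: the real benchmark estimates empirical power here.
    return sim_name, test

# Dispatch every (simulation, test) pair across all cores.
outputs = Parallel(n_jobs=-1, verbose=100)(
    [
        delayed(estimate_power)(sim_name, test)
        for sim_name in SIMULATIONS.keys()
        for test in INDEP_TESTS.keys()
    ]
)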
@@ -184,7 +184,7 @@ def plot_power():
        loc="upper center",
    )
    leg.get_frame().set_linewidth(0.0)
-    for legobj in leg.legendHandles:
+    for legobj in leg.legend_handles:
        legobj.set_linewidth(5.0)
    plt.subplots_adjust(hspace=0.50)

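Context for the legendHandles edits in this and the following files: Matplotlib renamed Legend.legendHandles to Legend.legend_handles (the camelCase alias was deprecated in Matplotlib 3.7 and later removed), so the benchmark plots switch to the new attribute. A self-contained sketch of the updated pattern, assuming Matplotlib >= 3.7:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], label="example")
leg = ax.legend(loc="upper center")
leg.get_frame().set_linewidth(0.0)
# Thicken the legend's proxy lines via the renamed attribute
# (was leg.legendHandles before Matplotlib 3.7).
for legobj in leg.legend_handles:
    legobj.set_linewidth(5.0)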
4 changes: 2 additions & 2 deletions benchmarks/indep_power_sampsize.py
@@ -95,7 +95,7 @@ def estimate_power(sim, test, auto=False):
#
# outputs = Parallel(n_jobs=-1, verbose=100)(
#     [
-#         delayed(estimate_featimport)(sim_name, test)
+#         delayed(estimate_power)(sim_name, test)
#         for sim_name in SIMULATIONS.keys()
#         for test in INDEP_TESTS.keys()
#     ]
@@ -146,7 +146,7 @@ def plot_power():
        loc="upper center",
    )
    leg.get_frame().set_linewidth(0.0)
-    for legobj in leg.legendHandles:
+    for legobj in leg.legend_handles:
        legobj.set_linewidth(5.0)
    plt.subplots_adjust(hspace=0.50)

2 changes: 1 addition & 1 deletion benchmarks/perf_1d.py
@@ -112,7 +112,7 @@ def plot_wall_times():
        loc="upper center",
    )
    leg.get_frame().set_linewidth(0.0)
-    for legobj in leg.legendHandles:
+    for legobj in leg.legend_handles:
        legobj.set_linewidth(5.0)


27 changes: 26 additions & 1 deletion docs/api/index.rst
@@ -21,7 +21,6 @@ Independence
   HHG
   CCA
   RV
-   FriedmanRafsky



@@ -57,6 +56,7 @@ Independence
   Hotelling
   SmoothCFTest
   MeanEmbeddingTest
+   FriedmanRafsky
   KSampleHHG


@@ -98,6 +98,9 @@ Conditional Independence
.. autosummary::
   :toctree: generated/

+   ConditionalDcorr
+   PartialDcorr
+   PartialCorr
   FCIT
   KCI

@@ -160,6 +163,28 @@ Independence Simulations
   rot_ksamp
   gaussian_3samp

+Conditional Independence Simulations
+""""""""""""""""""""""""""""""""""""
+
+.. autosummary::
+   :toctree: generated/
+
+   indep_normal
+   indep_lognormal
+   indep_binomial
+   cond_indep_normal
+   cond_indep_lognormal
+   cond_indep_normal_nonlinear
+   cond_indep_binomial
+   correlated_binomial
+   correlated_normal
+   correlated_normal_nonliear
+   correlated_lognormal
+   correlated_t_linear
+   correlated_t_quadratic
+   correlated_t_nonlinear
+   condi_indep_sim
+
Time-Series Simulations
""""""""""""""""""""""""

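Since the API reference now lists ConditionalDcorr, PartialDcorr, and PartialCorr, a usage sketch may help. It is hedged, not pulled from this diff: it assumes ConditionalDcorr follows the test(x, y, z) convention of hyppo's other conditional tests (FCIT, KCI) and returns a (statistic, p-value) pair.

import numpy as np
from hyppo.conditional import ConditionalDcorr

rng = np.random.default_rng(1)
z = rng.normal(size=(100, 1))
x = z + 0.1 * rng.normal(size=(100, 1))
y = z + 0.1 * rng.normal(size=(100, 1))

# x and y are associated only through z, so the test should not
# reject independence of x and y given z (expect a large p-value).
stat, pvalue = ConditionalDcorr().test(x, y, z)
print(stat, pvalue)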
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -79,10 +79,10 @@
# -- numpydoc
# Below is needed to prevent errors
numpydoc_class_members_toctree = True
-numpydoc_show_class_members = False
+numpydoc_show_class_members = True

# -- sphinx.ext.autosummary
-autosummary_generate = []
+autosummary_generate = True

# Otherwise, the Return parameter list looks different from the Parameters list
napoleon_use_rtype = False
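The two conf.py flips above are what make the new autosummary entries render: autosummary_generate = True has sphinx.ext.autosummary write a stub page under the :toctree: generated/ target for every listed name, and numpydoc_show_class_members = True lets numpydoc list class members on those pages. A sketch of the relevant configuration; the extensions list is assumed from context, not shown in this diff:

# Sketch of the relevant docs/conf.py settings after this commit.
extensions = [
    "sphinx.ext.autosummary",  # assumed enabled; not shown in this diff
    "numpydoc",
]
autosummary_generate = True          # emit generated/ stub pages
numpydoc_show_class_members = True   # list methods/attributes on class pages
numpydoc_class_members_toctree = True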
93 changes: 62 additions & 31 deletions docs/refs.bib
@@ -14,6 +14,17 @@ @incollection{hardleCanonicalCorrelationAnalysis2015
  language = {en}
}

+@article{wang2015conditional,
+  title={Conditional distance correlation},
+  author={Wang, Xueqin and Pan, Wenliang and Hu, Wenhao and Tian, Yuan and Zhang, Heping},
+  journal={Journal of the American Statistical Association},
+  volume={110},
+  number={512},
+  pages={1726--1734},
+  year={2015},
+  publisher={Taylor \& Francis}
+}

@article{hardoonCanonicalCorrelationAnalysis2004,
title = {Canonical {{Correlation Analysis}}: {{An Overview}} with {{Application}} to {{Learning Methods}}},
shorttitle = {Canonical {{Correlation Analysis}}},
@@ -425,18 +436,22 @@ @article{grettonKernelTwoSampleTest2012
abstract = {We propose a framework for analyzing and comparing distributions, which we use to construct statistical tests to determine if two samples are drawn from different distributions. Our test statistic is the largest difference in expectations over functions in the unit ball of a reproducing kernel Hilbert space (RKHS), and is called the maximum mean discrepancy (MMD). We present two distribution-free tests based on large deviation bounds for the MMD, and a third test based on the asymptotic distribution of this statistic. The MMD can be computed in quadratic time, although efficient linear time approximations are available. Our statistic is an instance of an integral probability metric, and various classical metrics on distributions are obtained when alternative function classes are used in place of an RKHS. We apply our two-sample tests to a variety of problems, including attribute matching for databases using the Hungarian marriage method, where they perform strongly. Excellent performance is also obtained when comparing distributions over graphs, for which these are the first such tests.},
}

+@article{ljung1978measure,
+  title={On a measure of lack of fit in time series models},
+  author={Ljung, Greta M and Box, George EP},
+  journal={Biometrika},
+  volume={65},
+  number={2},
+  pages={297--303},
+  year={1978},
+  publisher={Oxford University Press}
+}

@article{mehtaIndependenceTestingMultivariate2020,
-  title = {Independence {{Testing}} for {{Multivariate Time Series}}},
-  author = {Mehta, Ronak and Chung, Jaewon and Shen, Cencheng and Xu, Ting and Vogelstein, Joshua T.},
-  year = {2020},
-  month = may,
-  journal = {arXiv:1908.06486 [cs, stat]},
-  eprint = {1908.06486},
-  eprinttype = {arxiv},
-  primaryclass = {cs, stat},
-  abstract = {Complex data structures such as time series are increasingly present in modern data science problems. A fundamental question is whether two such time-series are statistically dependent. Many current approaches make parametric assumptions on the random processes, only detect linear association, require multiple tests, or forfeit power in high-dimensional, nonlinear settings. Estimating the distribution of any test statistic under the null is non-trivial, as the permutation test is invalid. This work juxtaposes distance correlation (Dcorr) and multiscale graph correlation (MGC) from independence testing literature and block permutation from time series analysis to address these challenges. The proposed nonparametric procedure is valid and consistent, building upon prior work by characterizing the geometry of the relationship, estimating the time lag at which dependence is maximized, avoiding the need for multiple testing, and exhibiting superior power in high-dimensional, low sample size, nonlinear settings. Neural connectivity is analyzed via fMRI data, revealing linear dependence of signals within the visual network and default mode network, and nonlinear relationships in other networks. This work uncovers a first-resort data analysis tool with open-source code available, directly impacting a wide range of scientific disciplines.},
-  archiveprefix = {arXiv},
-  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning,Statistics - Methodology},
+  title={Independence testing for temporal data},
+  author={Shen, Cencheng and Chung, Jaewon and Mehta, Ronak and Xu, Ting and Vogelstein, Joshua T},
+  journal={Transactions on Machine Learning Research},
+  year={2024}
}

@article{shenChiSquareTestDistance2021,
@@ -458,29 +473,22 @@ @article{shenChiSquareTestDistance2021
}

@article{chwialkowski2015fast,
-  title={Fast Two-Sample Testing with Analytic Representations of Probability Measures},
-  author={Kacper Chwialkowski and Aaditya Ramdas and Dino Sejdinovic and Arthur Gretton},
-  year={2015},
-  journal={arXiv:1506.04725 [math, stat]},
-  print={1506.04725},
-  eprinttype={arxiv},
-  abstract={We propose a class of nonparametric two-sample tests with a cost linear in the sample size. Two tests are given, both based on an ensemble of distances between analytic functions representing each of the distributions. The first test uses smoothed empirical characteristic functions to represent the distributions, the second uses distribution embeddings in a reproducing kernel Hilbert space. Analyticity implies that differences in the distributions may be detected almost surely at a finite number of randomly chosen locations/frequencies. The new tests are consistent against a larger class of alternatives than the previous linear-time tests based on the (non-smoothed) empirical characteristic functions, while being much faster than the current state-of-the-art quadratic-time kernel-based or energy distance-based tests. Experiments on artificial benchmarks and on challenging real-world testing problems demonstrate that our tests give a better power/time tradeoff than competing approaches, and in some cases, better outright power than even the most expensive quadratic-time tests. This performance advantage is retained even in high dimensions, and in cases where the difference in distributions is not observable with low order statistics.},
-  archivePrefix={arXiv},
-  primaryClass={stat.ML}
+  title={Fast two-sample testing with analytic representations of probability measures},
+  author={Chwialkowski, Kacper P and Ramdas, Aaditya and Sejdinovic, Dino and Gretton, Arthur},
+  journal={Advances in Neural Information Processing Systems},
+  volume={28},
+  year={2015}
}

@article{grettonKernelJointIndependence2016,
-  title = {{Kernel-based Tests} for {Joint Independence}},
-  author = {Pfister, Nikolas and Buhlmann, Peter and Scholkopf, Bernhard and Peters, Jonas},
-  year = {2016},
-  month = nov,
-  journal = {arXiv:1603.00285 [math, stat]},
-  eprint = {1603.00285},
-  eprinttype = {arxiv},
-  primaryclass = {math, stat},
-  abstract = {We investigate the problem of testing whether d random variables, which may or may not be continuous, are jointly (or mutually) independent. Our method builds on ideas of the two variable Hilbert-Schmidt independence criterion (HSIC) but allows for an arbitrary number of variables. We embed the d-dimensional joint distribution and the product of the marginals into a reproducing kernel Hilbert space and define the d-variable Hilbert-Schmidt independence criterion (dHSIC) as the squared distance between the embeddings. In the population case, the value of dHSIC is zero if and only if the d variables are jointly independent, as long as the kernel is characteristic. Based on an empirical estimate of dHSIC, we define three different non-parametric hypothesis tests: a permutation test, a bootstrap test and a test based on a Gamma approximation. We prove that the permutation test achieves the significance level and that the bootstrap test achieves pointwise asymptotic significance level as well as pointwise asymptotic consistency (i.e., it is able to detect any type of fixed dependence in the large sample limit). The Gamma approximation does not come with these guarantees; however, it is computationally very fast and for small d, it performs well in practice. Finally, we apply the test to a problem in causal discovery.},
-  archiveprefix = {arXiv},
-  keywords = {Math - Statistics Theory, Statistics - Machine Learning},
+  title={Kernel-based tests for joint independence},
+  author={Pfister, Niklas and B{\"u}hlmann, Peter and Sch{\"o}lkopf, Bernhard and Peters, Jonas},
+  journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
+  volume={80},
+  number={1},
+  pages={5--31},
+  year={2018},
+  publisher={Wiley Online Library}
}

@article{friedmanMultivariateGeneralizationsoftheWaldWolfowitzandSmirnovTwoSampleTests1979,
@@ -544,4 +552,27 @@ @Inbook{hotellingRelationsTwoSets1992
isbn="978-1-4612-4380-9",
doi="10.1007/978-1-4612-4380-9_14",
url="https://doi.org/10.1007/978-1-4612-4380-9_14"
}

+@article{jitkrittum2017linear,
+  title={A linear-time kernel goodness-of-fit test},
+  author={Jitkrittum, Wittawat and Xu, Wenkai and Szab{\'o}, Zolt{\'a}n and Fukumizu, Kenji and Gretton, Arthur},
+  journal={Advances in Neural Information Processing Systems},
+  volume={30},
+  year={2017}
+}
+
+@inproceedings{10.5555/3020548.3020641,
+  author = {Zhang, Kun and Peters, Jonas and Janzing, Dominik and Sch\"{o}lkopf, Bernhard},
+  title = {Kernel-Based Conditional Independence Test and Application in Causal Discovery},
+  year = {2011},
+  isbn = {9780974903972},
+  publisher = {AUAI Press},
+  address = {Arlington, Virginia, USA},
+  abstract = {Conditional independence testing is an important problem, especially in Bayesian network learning and causal discovery. Due to the curse of dimensionality, testing for conditional independence of continuous variables is particularly challenging. We propose a Kernel-based Conditional Independence test (KCI-test), by constructing an appropriate test statistic and deriving its asymptotic distribution under the null hypothesis of conditional independence. The proposed method is computationally efficient and easy to implement. Experimental results show that it outperforms other methods, especially when the conditioning set is large or the sample size is not very large, in which case other methods encounter difficulties.},
+  booktitle = {Proceedings of the Twenty-Seventh Conference on Uncertainty in Artificial Intelligence},
+  pages = {804–813},
+  numpages = {10},
+  location = {Barcelona, Spain},
+  series = {UAI'11}
+}
10 changes: 6 additions & 4 deletions hyppo/conditional/FCIT.py
@@ -1,13 +1,11 @@
import time
-import joblib

+import joblib
import numpy as np
from scipy.stats import ttest_1samp
from sklearn.metrics import mean_squared_error as mse
+from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler
-
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

from .base import ConditionalIndependenceTest, ConditionalIndependenceTestOutput
@@ -32,8 +30,12 @@ class FCIT(ConditionalIndependenceTest):
        Proportion of data to evaluate test stat on.
    discrete: tuple of string
        Whether :math:`X` or :math:`Y` are discrete
+    Notes
+    -----
+    .. note::
+       This algorithm is currently a pre-print on arXiv.
    The motivation for the test rests on the assumption that if :math:`X \not\!\perp\!\!\!\perp Y \mid Z`,
    then :math:`Y` should be more accurately predicted by using both
    :math:`X` and :math:`Z` as covariates as opposed to only using
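The motivation paragraph in the docstring above lends itself to a toy demonstration. Below is a conceptual sketch of the predictive comparison FCIT builds on, not hyppo's implementation; the data-generating process and regressor choice are illustrative only.

import numpy as np
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
z = rng.normal(size=(500, 1))
x = rng.normal(size=(500, 1))
y = (x + z).ravel() + 0.1 * rng.normal(size=500)  # y depends on x given z

# Compare held-out error of predicting y from (x, z) versus from z alone.
xz = np.hstack([x, z])
xz_tr, xz_te, z_tr, z_te, y_tr, y_te = train_test_split(xz, z, y, random_state=0)

err_full = mse(y_te, DecisionTreeRegressor(random_state=0).fit(xz_tr, y_tr).predict(xz_te))
err_reduced = mse(y_te, DecisionTreeRegressor(random_state=0).fit(z_tr, y_tr).predict(z_te))
print(err_full < err_reduced)  # True here: x predicts y beyond what z explains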