misc documentation fixes (#417)
* add conditional tests to API reference

* fix doc string issues

* render math properly in conditional docstrings

* change ljungbox reference

* try using automodule

* [skip ci] fix api

* [skip ci] fix for reference

* revert old version of doing things

* add notes for preprints

* add tutorials for CDcorr

* [skip ci] fix documentation imports for CDcorr and PDcorr

* add conditional simulations to documentation
sampan501 authored Sep 12, 2024
1 parent 8ec3610 commit be77507
Showing 25 changed files with 227 additions and 212 deletions.
132 changes: 0 additions & 132 deletions benchmarks/condi_indep_power_sampsize.py

This file was deleted.

4 changes: 2 additions & 2 deletions benchmarks/indep_power_dimension.py
@@ -132,7 +132,7 @@ def estimate_power(sim, test, auto=False):
#
# outputs = Parallel(n_jobs=-1, verbose=100)(
#     [
-#         delayed(estimate_featimport)(sim_name, test)
+#         delayed(estimate_power)(sim_name, test)
#         for sim_name in SIMULATIONS.keys()
#         for test in INDEP_TESTS.keys()
#     ]
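For reference, the corrected driver in runnable form. This is a minimal sketch, not the benchmark itself: SIMULATIONS, INDEP_TESTS, and the estimate_power body are hypothetical stand-ins, and only the joblib dispatch pattern with the fixed estimate_power call is the point.

from joblib import Parallel, delayed

# Hypothetical stand-ins for the benchmark's real grids and power routine.
SIMULATIONS = {"linear": None, "spiral": None}
INDEP_TESTS = {"Dcorr": None, "Hsic": None}

def estimate_power(sim_name, test):
    # Placeholder: the real benchmark estimates empirical power here.
    return sim_name, test

# Dispatch every (simulation, test) pair across all cores.
outputs = Parallel(n_jobs=-1, verbose=100)(
    [
        delayed(estimate_power)(sim_name, test)
        for sim_name in SIMULATIONS.keys()
        for test in INDEP_TESTS.keys()
    ]
)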
@@ -184,7 +184,7 @@ def plot_power():
        loc="upper center",
    )
    leg.get_frame().set_linewidth(0.0)
-    for legobj in leg.legendHandles:
+    for legobj in leg.legend_handles:
        legobj.set_linewidth(5.0)
    plt.subplots_adjust(hspace=0.50)

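Context for the legendHandles edits in this and the following files: Matplotlib renamed Legend.legendHandles to Legend.legend_handles (the camelCase alias was deprecated in Matplotlib 3.7 and later removed), so the benchmark plots switch to the new attribute. A self-contained sketch of the updated pattern, assuming Matplotlib >= 3.7:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], label="example")
leg = ax.legend(loc="upper center")
leg.get_frame().set_linewidth(0.0)
# Thicken the legend's proxy lines via the renamed attribute
# (was leg.legendHandles before Matplotlib 3.7).
for legobj in leg.legend_handles:
    legobj.set_linewidth(5.0)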
4 changes: 2 additions & 2 deletions benchmarks/indep_power_sampsize.py
@@ -95,7 +95,7 @@ def estimate_power(sim, test, auto=False):
#
# outputs = Parallel(n_jobs=-1, verbose=100)(
#     [
-#         delayed(estimate_featimport)(sim_name, test)
+#         delayed(estimate_power)(sim_name, test)
#         for sim_name in SIMULATIONS.keys()
#         for test in INDEP_TESTS.keys()
#     ]
@@ -146,7 +146,7 @@ def plot_power():
        loc="upper center",
    )
    leg.get_frame().set_linewidth(0.0)
-    for legobj in leg.legendHandles:
+    for legobj in leg.legend_handles:
        legobj.set_linewidth(5.0)
    plt.subplots_adjust(hspace=0.50)

2 changes: 1 addition & 1 deletion benchmarks/perf_1d.py
@@ -112,7 +112,7 @@ def plot_wall_times():
        loc="upper center",
    )
    leg.get_frame().set_linewidth(0.0)
-    for legobj in leg.legendHandles:
+    for legobj in leg.legend_handles:
        legobj.set_linewidth(5.0)


27 changes: 26 additions & 1 deletion docs/api/index.rst
@@ -21,7 +21,6 @@ Independence
   HHG
   CCA
   RV
-   FriedmanRafsky



@@ -57,6 +56,7 @@ Independence
   Hotelling
   SmoothCFTest
   MeanEmbeddingTest
+   FriedmanRafsky
   KSampleHHG


@@ -98,6 +98,9 @@ Conditional Independence
.. autosummary::
   :toctree: generated/

+   ConditionalDcorr
+   PartialDcorr
+   PartialCorr
   FCIT
   KCI

@@ -160,6 +163,28 @@ Independence Simulations
   rot_ksamp
   gaussian_3samp

+Conditional Independence Simulations
+""""""""""""""""""""""""""""""""""""
+
+.. autosummary::
+   :toctree: generated/
+
+   indep_normal
+   indep_lognormal
+   indep_binomial
+   cond_indep_normal
+   cond_indep_lognormal
+   cond_indep_normal_nonlinear
+   cond_indep_binomial
+   correlated_binomial
+   correlated_normal
+   correlated_normal_nonliear
+   correlated_lognormal
+   correlated_t_linear
+   correlated_t_quadratic
+   correlated_t_nonlinear
+   condi_indep_sim
+
Time-Series Simulations
""""""""""""""""""""""""

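Since the API reference now lists ConditionalDcorr, PartialDcorr, and PartialCorr, a usage sketch may help. It is hedged, not pulled from this diff: it assumes ConditionalDcorr follows the test(x, y, z) convention of hyppo's other conditional tests (FCIT, KCI) and returns a (statistic, p-value) pair.

import numpy as np
from hyppo.conditional import ConditionalDcorr

rng = np.random.default_rng(1)
z = rng.normal(size=(100, 1))
x = z + 0.1 * rng.normal(size=(100, 1))
y = z + 0.1 * rng.normal(size=(100, 1))

# x and y are associated only through z, so the test should not
# reject independence of x and y given z (expect a large p-value).
stat, pvalue = ConditionalDcorr().test(x, y, z)
print(stat, pvalue)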
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -79,10 +79,10 @@
# -- numpydoc
# Below is needed to prevent errors
numpydoc_class_members_toctree = True
-numpydoc_show_class_members = False
+numpydoc_show_class_members = True

# -- sphinx.ext.autosummary
-autosummary_generate = []
+autosummary_generate = True

# Otherwise, the Return parameter list looks different from the Parameters list
napoleon_use_rtype = False
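The two conf.py flips above are what make the new autosummary entries render: autosummary_generate = True has sphinx.ext.autosummary write a stub page under the :toctree: generated/ target for every listed name, and numpydoc_show_class_members = True lets numpydoc list class members on those pages. A sketch of the relevant configuration; the extensions list is assumed from context, not shown in this diff:

# Sketch of the relevant docs/conf.py settings after this commit.
extensions = [
    "sphinx.ext.autosummary",  # assumed enabled; not shown in this diff
    "numpydoc",
]
autosummary_generate = True          # emit generated/ stub pages
numpydoc_show_class_members = True   # list methods/attributes on class pages
numpydoc_class_members_toctree = True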
93 changes: 62 additions & 31 deletions docs/refs.bib
@@ -14,6 +14,17 @@ @incollection{hardleCanonicalCorrelationAnalysis2015
  language = {en}
}

+@article{wang2015conditional,
+  title={Conditional distance correlation},
+  author={Wang, Xueqin and Pan, Wenliang and Hu, Wenhao and Tian, Yuan and Zhang, Heping},
+  journal={Journal of the American Statistical Association},
+  volume={110},
+  number={512},
+  pages={1726--1734},
+  year={2015},
+  publisher={Taylor \& Francis}
+}

@article{hardoonCanonicalCorrelationAnalysis2004,
title = {Canonical {{Correlation Analysis}}: {{An Overview}} with {{Application}} to {{Learning Methods}}},
shorttitle = {Canonical {{Correlation Analysis}}},
@@ -425,18 +436,22 @@ @article{grettonKernelTwoSampleTest2012
abstract = {We propose a framework for analyzing and comparing distributions, which we use to construct statistical tests to determine if two samples are drawn from different distributions. Our test statistic is the largest difference in expectations over functions in the unit ball of a reproducing kernel Hilbert space (RKHS), and is called the maximum mean discrepancy (MMD). We present two distribution-free tests based on large deviation bounds for the MMD, and a third test based on the asymptotic distribution of this statistic. The MMD can be computed in quadratic time, although efficient linear time approximations are available. Our statistic is an instance of an integral probability metric, and various classical metrics on distributions are obtained when alternative function classes are used in place of an RKHS. We apply our two-sample tests to a variety of problems, including attribute matching for databases using the Hungarian marriage method, where they perform strongly. Excellent performance is also obtained when comparing distributions over graphs, for which these are the first such tests.},
}

+@article{ljung1978measure,
+  title={On a measure of lack of fit in time series models},
+  author={Ljung, Greta M and Box, George EP},
+  journal={Biometrika},
+  volume={65},
+  number={2},
+  pages={297--303},
+  year={1978},
+  publisher={Oxford University Press}
+}

@article{mehtaIndependenceTestingMultivariate2020,
-  title = {Independence {{Testing}} for {{Multivariate Time Series}}},
-  author = {Mehta, Ronak and Chung, Jaewon and Shen, Cencheng and Xu, Ting and Vogelstein, Joshua T.},
-  year = {2020},
-  month = may,
-  journal = {arXiv:1908.06486 [cs, stat]},
-  eprint = {1908.06486},
-  eprinttype = {arxiv},
-  primaryclass = {cs, stat},
-  abstract = {Complex data structures such as time series are increasingly present in modern data science problems. A fundamental question is whether two such time-series are statistically dependent. Many current approaches make parametric assumptions on the random processes, only detect linear association, require multiple tests, or forfeit power in high-dimensional, nonlinear settings. Estimating the distribution of any test statistic under the null is non-trivial, as the permutation test is invalid. This work juxtaposes distance correlation (Dcorr) and multiscale graph correlation (MGC) from independence testing literature and block permutation from time series analysis to address these challenges. The proposed nonparametric procedure is valid and consistent, building upon prior work by characterizing the geometry of the relationship, estimating the time lag at which dependence is maximized, avoiding the need for multiple testing, and exhibiting superior power in high-dimensional, low sample size, nonlinear settings. Neural connectivity is analyzed via fMRI data, revealing linear dependence of signals within the visual network and default mode network, and nonlinear relationships in other networks. This work uncovers a first-resort data analysis tool with open-source code available, directly impacting a wide range of scientific disciplines.},
-  archiveprefix = {arXiv},
-  keywords = {Computer Science - Machine Learning,Statistics - Machine Learning,Statistics - Methodology},
+  title={Independence testing for temporal data},
+  author={Shen, Cencheng and Chung, Jaewon and Mehta, Ronak and Xu, Ting and Vogelstein, Joshua T},
+  journal={Transactions on Machine Learning Research},
+  year={2024}
}

@article{shenChiSquareTestDistance2021,
@@ -458,29 +473,22 @@ @article{shenChiSquareTestDistance2021
}

@article{chwialkowski2015fast,
-  title={Fast Two-Sample Testing with Analytic Representations of Probability Measures},
-  author={Kacper Chwialkowski and Aaditya Ramdas and Dino Sejdinovic and Arthur Gretton},
-  year={2015},
-  journal={arXiv:1506.04725 [math, stat]},
-  print={1506.04725},
-  eprinttype={arxiv},
-  abstract={We propose a class of nonparametric two-sample tests with a cost linear in the sample size. Two tests are given, both based on an ensemble of distances between analytic functions representing each of the distributions. The first test uses smoothed empirical characteristic functions to represent the distributions, the second uses distribution embeddings in a reproducing kernel Hilbert space. Analyticity implies that differences in the distributions may be detected almost surely at a finite number of randomly chosen locations/frequencies. The new tests are consistent against a larger class of alternatives than the previous linear-time tests based on the (non-smoothed) empirical characteristic functions, while being much faster than the current state-of-the-art quadratic-time kernel-based or energy distance-based tests. Experiments on artificial benchmarks and on challenging real-world testing problems demonstrate that our tests give a better power/time tradeoff than competing approaches, and in some cases, better outright power than even the most expensive quadratic-time tests. This performance advantage is retained even in high dimensions, and in cases where the difference in distributions is not observable with low order statistics.},
-  archivePrefix={arXiv},
-  primaryClass={stat.ML}
+  title={Fast two-sample testing with analytic representations of probability measures},
+  author={Chwialkowski, Kacper P and Ramdas, Aaditya and Sejdinovic, Dino and Gretton, Arthur},
+  journal={Advances in Neural Information Processing Systems},
+  volume={28},
+  year={2015}
}

@article{grettonKernelJointIndependence2016,
-  title = {{Kernel-based Tests} for {Joint Independence}},
-  author = {Pfister, Nikolas and Buhlmann, Peter and Scholkopf, Bernhard and Peters, Jonas},
-  year = {2016},
-  month = nov,
-  journal = {arXiv:1603.00285 [math, stat]},
-  eprint = {1603.00285},
-  eprinttype = {arxiv},
-  primaryclass = {math, stat},
-  abstract = {We investigate the problem of testing whether d random variables, which may or may not be continuous, are jointly (or mutually) independent. Our method builds on ideas of the two variable Hilbert-Schmidt independence criterion (HSIC) but allows for an arbitrary number of variables. We embed the d-dimensional joint distribution and the product of the marginals into a reproducing kernel Hilbert space and define the d-variable Hilbert-Schmidt independence criterion (dHSIC) as the squared distance between the embeddings. In the population case, the value of dHSIC is zero if and only if the d variables are jointly independent, as long as the kernel is characteristic. Based on an empirical estimate of dHSIC, we define three different non-parametric hypothesis tests: a permutation test, a bootstrap test and a test based on a Gamma approximation. We prove that the permutation test achieves the significance level and that the bootstrap test achieves pointwise asymptotic significance level as well as pointwise asymptotic consistency (i.e., it is able to detect any type of fixed dependence in the large sample limit). The Gamma approximation does not come with these guarantees; however, it is computationally very fast and for small d, it performs well in practice. Finally, we apply the test to a problem in causal discovery.},
-  archiveprefix = {arXiv},
-  keywords = {Math - Statistics Theory, Statistics - Machine Learning},
+  title={Kernel-based tests for joint independence},
+  author={Pfister, Niklas and B{\"u}hlmann, Peter and Sch{\"o}lkopf, Bernhard and Peters, Jonas},
+  journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
+  volume={80},
+  number={1},
+  pages={5--31},
+  year={2018},
+  publisher={Wiley Online Library}
}

@article{friedmanMultivariateGeneralizationsoftheWaldWolfowitzandSmirnovTwoSampleTests1979,
@@ -544,4 +552,27 @@ @Inbook{hotellingRelationsTwoSets1992
isbn="978-1-4612-4380-9",
doi="10.1007/978-1-4612-4380-9_14",
url="https://doi.org/10.1007/978-1-4612-4380-9_14"
}

+@article{jitkrittum2017linear,
+  title={A linear-time kernel goodness-of-fit test},
+  author={Jitkrittum, Wittawat and Xu, Wenkai and Szab{\'o}, Zolt{\'a}n and Fukumizu, Kenji and Gretton, Arthur},
+  journal={Advances in Neural Information Processing Systems},
+  volume={30},
+  year={2017}
+}
+
+@inproceedings{10.5555/3020548.3020641,
+  author = {Zhang, Kun and Peters, Jonas and Janzing, Dominik and Sch\"{o}lkopf, Bernhard},
+  title = {Kernel-Based Conditional Independence Test and Application in Causal Discovery},
+  year = {2011},
+  isbn = {9780974903972},
+  publisher = {AUAI Press},
+  address = {Arlington, Virginia, USA},
+  abstract = {Conditional independence testing is an important problem, especially in Bayesian network learning and causal discovery. Due to the curse of dimensionality, testing for conditional independence of continuous variables is particularly challenging. We propose a Kernel-based Conditional Independence test (KCI-test), by constructing an appropriate test statistic and deriving its asymptotic distribution under the null hypothesis of conditional independence. The proposed method is computationally efficient and easy to implement. Experimental results show that it outperforms other methods, especially when the conditioning set is large or the sample size is not very large, in which case other methods encounter difficulties.},
+  booktitle = {Proceedings of the Twenty-Seventh Conference on Uncertainty in Artificial Intelligence},
+  pages = {804–813},
+  numpages = {10},
+  location = {Barcelona, Spain},
+  series = {UAI'11}
+}
10 changes: 6 additions & 4 deletions hyppo/conditional/FCIT.py
@@ -1,13 +1,11 @@
import time
-import joblib

+import joblib
import numpy as np
from scipy.stats import ttest_1samp
from sklearn.metrics import mean_squared_error as mse
+from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler
-
-from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import ShuffleSplit
from sklearn.tree import DecisionTreeRegressor

from .base import ConditionalIndependenceTest, ConditionalIndependenceTestOutput
@@ -32,8 +30,12 @@ class FCIT(ConditionalIndependenceTest):
        Proportion of data to evaluate test stat on.
    discrete: tuple of string
        Whether :math:`X` or :math:`Y` are discrete
+    Notes
+    -----
+    .. note::
+       This algorithm is currently a pre-print on arXiv.
    The motivation for the test rests on the assumption that if :math:`X \not\!\perp\!\!\!\perp Y \mid Z`,
    then :math:`Y` should be more accurately predicted by using both
    :math:`X` and :math:`Z` as covariates as opposed to only using
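The motivation paragraph in the docstring above lends itself to a toy demonstration. Below is a conceptual sketch of the predictive comparison FCIT builds on, not hyppo's implementation; the data-generating process and regressor choice are illustrative only.

import numpy as np
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
z = rng.normal(size=(500, 1))
x = rng.normal(size=(500, 1))
y = (x + z).ravel() + 0.1 * rng.normal(size=500)  # y depends on x given z

# Compare held-out error of predicting y from (x, z) versus from z alone.
xz = np.hstack([x, z])
xz_tr, xz_te, z_tr, z_te, y_tr, y_te = train_test_split(xz, z, y, random_state=0)

err_full = mse(y_te, DecisionTreeRegressor(random_state=0).fit(xz_tr, y_tr).predict(xz_te))
err_reduced = mse(y_te, DecisionTreeRegressor(random_state=0).fit(z_tr, y_tr).predict(z_te))
print(err_full < err_reduced)  # True here: x predicts y beyond what z explains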