diff --git a/docs/make.jl b/docs/make.jl index a3ae15bcd..3c1d3a2d7 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -23,17 +23,9 @@ makedocs(; "Normalizing Flows" => "tutorials/flows.md", ], "Algorithms" => [ - "KLMinRepGradDescent" => "paramspacesgd/klminrepgraddescent.md", - "KLMinRepGradProxDescent" => "paramspacesgd/klminrepgradproxdescent.md", - "KLMinScoreGradDescent" => "paramspacesgd/klminscoregraddescent.md", - "Parameter Space SGD" => [ - "General" => "paramspacesgd/general.md", - "Objectives" => [ - "Overview" => "paramspacesgd/objectives.md", - "RepGradELBO" => "paramspacesgd/repgradelbo.md", - "ScoreGradELBO" => "paramspacesgd/scoregradelbo.md", - ], - ], + "KLMinRepGradDescent" => "klminrepgraddescent.md", + "KLMinRepGradProxDescent" => "klminrepgradproxdescent.md", + "KLMinScoreGradDescent" => "klminscoregraddescent.md", ], "Variational Families" => "families.md", "Optimization" => "optimization.md", diff --git a/docs/src/families.md b/docs/src/families.md index 761769f3a..a7f2e19b8 100644 --- a/docs/src/families.md +++ b/docs/src/families.md @@ -1,6 +1,6 @@ # [Reparameterizable Variational Families](@id families) -The [RepGradELBO](@ref repgradelbo) objective assumes that the members of the variational family have a differentiable sampling path. +Algorithms such as [`KLMinRepGradDescent`](@ref klminrepgraddescent) assume that the members of the variational family have a differentiable sampling path. We provide multiple pre-packaged variational families that can be readily used. 
## [The `LocationScale` Family](@id locscale) diff --git a/docs/src/index.md b/docs/src/index.md index c02c2cd19..5728e6f5a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -10,7 +10,6 @@ VI algorithms perform scalable and computationally efficient Bayesian inference # List of Algorithms - - [ParamSpaceSGD](@ref paramspacesgd) - [KLMinRepGradDescent](@ref klminrepgraddescent) (alias of `ADVI`) - [KLMinRepGradProxDescent](@ref klminrepgradproxdescent) - [KLMinScoreGradDescent](@ref klminscoregraddescent) (alias of `BBVI`) diff --git a/docs/src/paramspacesgd/repgradelbo.md b/docs/src/klminrepgraddescent.md similarity index 79% rename from docs/src/paramspacesgd/repgradelbo.md rename to docs/src/klminrepgraddescent.md index f61940be1..0deec5a01 100644 --- a/docs/src/paramspacesgd/repgradelbo.md +++ b/docs/src/klminrepgraddescent.md @@ -1,49 +1,75 @@ -# [Reparameterization Gradient Estimator](@id repgradelbo) +# [`KLMinRepGradDescent`](@id klminrepgraddescent) -## Overview +## Description -The `RepGradELBO` objective implements the reparameterization gradient estimator[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient. -The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. -For the variational family $\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda\}$, suppose the process of sampling from $q_{\lambda}$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that +This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. +Specifically, it uses the *reparameterization gradient estimator*. +As a result, this algorithm is best applicable when the target log-density is differentiable and the sampling process of the variational family is differentiable. 
+(See the [methodology section](@ref klminrepgraddescent_method) for more details.) +This algorithm is also commonly referred to as automatic differentiation variational inference, black-box variational inference with the reparameterization gradient, and stochastic gradient variational inference. +`KLMinRepGradDescent` is also an alias of `ADVI` . + +```@docs +KLMinRepGradDescent +``` + +## [Methodology](@id klminrepgraddescent_method) + +This algorithm aims to solve the problem -[^HC1983]: Ho, Y. C., & Cao, X. (1983). Perturbation analysis and optimization of queueing networks. Journal of optimization theory and Applications, 40(4), 559-582. -[^G1991]: Glasserman, P. (1991). Gradient estimation via perturbation analysis (Vol. 116). Springer Science & Business Media. -[^R1992]: Rubinstein, R. Y. (1992). Sensitivity analysis of discrete event systems by the “push out” method. Annals of Operations Research, 39(1), 229-250. -[^P1996]: Pflug, G. C. (1996). Optimization of stochastic models: the interface between simulation and optimization (Vol. 373). Springer Science & Business Media. -[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. -[^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. -[^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. ```math -z \sim q_{\lambda} \qquad\Leftrightarrow\qquad -z \stackrel{d}{=} T_{\lambda}\left(\epsilon\right);\quad \epsilon \sim \varphi \; . 
+ \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) ``` -In these cases, denoting the target log denstiy as $\log \pi$, we can effectively estimate the gradient of the ELBO by directly differentiating the stochastic estimate of the ELBO objective +where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters. +That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume that there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$. + +Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. +Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) ```math - \widehat{\mathrm{ELBO}}\left(\lambda\right) = \frac{1}{M}\sum^M_{m=1} \log \pi\left(T_{\lambda}\left(\epsilon_m\right)\right) + \mathbb{H}\left(q_{\lambda}\right), + \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), ``` -where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. -The resulting gradient estimate is called the reparameterization gradient estimator. +which is equivalent to the negative KL divergence up to an additive constant (the log-evidence). -In addition to the reparameterization gradient, `AdvancedVI` provides the following features: +Algorithmically, `KLMinRepGradDescent` iterates the step - 1. **Posteriors with constrained supports** are handled through [`Bijectors`](https://github.com/TuringLang/Bijectors.jl), which is known as the automatic differentiation VI (ADVI; [^KTRGB2017]) formulation. (See [this section](@ref bijectors).) - 2. 
**The gradient of the entropy** can be estimated through various strategies depending on the capabilities of the variational family. (See [this section](@ref entropygrad).) +```math + \lambda_{t+1} = \mathrm{operator}\big( + \lambda_{t} + \gamma_t \widehat{\nabla_{\lambda} \mathrm{ELBO}} (q_{\lambda_t}) + \big) , +``` -## `RepGradELBO` +where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the reparameterization gradient estimate[^HC1983][^G1991][^R1992][^P1996] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). -To use the reparameterization gradient, `AdvancedVI` provides the following variational objective: +The reparameterization gradient, also known as the push-in gradient or the pathwise gradient, was introduced to VI in [^TL2014][^RMW2014][^KW2014]. +For the variational family $$\mathcal{Q}$$, suppose the process of sampling from $$q_{\lambda} \in \mathcal{Q}$$ can be described by some differentiable reparameterization function $$T_{\lambda}$$ and a *base distribution* $$\varphi$$ independent of $$\lambda$$ such that -```@docs -RepGradELBO +```math +z \sim q_{\lambda} \qquad\Leftrightarrow\qquad +z \stackrel{d}{=} T_{\lambda}\left(\epsilon\right);\quad \epsilon \sim \varphi \; . ``` -## [Handling Constraints with `Bijectors`](@id bijectors) +In these cases, denoting the target log density as $\log \pi$, we can effectively estimate the gradient of the ELBO by directly differentiating the stochastic estimate of the ELBO objective -As mentioned in the docstring, the `RepGradELBO` objective assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. +```math + \widehat{\mathrm{ELBO}}\left(q_{\lambda}\right) = \frac{1}{M}\sum^M_{m=1} \log \pi\left(T_{\lambda}\left(\epsilon_m\right)\right) + \mathbb{H}\left(q_{\lambda}\right), +``` + +where $$\epsilon_m \sim \varphi$$ are Monte Carlo samples. 
+ +[^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. +[^HC1983]: Ho, Y. C., & Cao, X. (1983). Perturbation analysis and optimization of queueing networks. Journal of optimization theory and Applications, 40(4), 559-582. +[^G1991]: Glasserman, P. (1991). Gradient estimation via perturbation analysis (Vol. 116). Springer Science & Business Media. +[^R1992]: Rubinstein, R. Y. (1992). Sensitivity analysis of discrete event systems by the “push out” method. Annals of Operations Research, 39(1), 229-250. +[^P1996]: Pflug, G. C. (1996). Optimization of stochastic models: the interface between simulation and optimization (Vol. 373). Springer Science & Business Media. +[^TL2014]: Titsias, M., & Lázaro-Gredilla, M. (2014). Doubly stochastic variational Bayes for non-conjugate inference. In *International Conference on Machine Learning*. +[^RMW2014]: Rezende, D. J., Mohamed, S., & Wierstra, D. (2014). Stochastic backpropagation and approximate inference in deep generative models. In *International Conference on Machine Learning*. +[^KW2014]: Kingma, D. P., & Welling, M. (2014). Auto-encoding variational bayes. In *International Conference on Learning Representations*. +## [Handling Constraints with `Bijectors`](@id bijectors) +As mentioned in the docstring, `KLMinRepGradDescent` assumes that the variational approximation $$q_{\lambda}$$ and the target distribution $$\pi$$ have the same support for all $$\lambda \in \Lambda$$. However, in general, it is most convenient to use variational families that have the whole Euclidean space $$\mathbb{R}^d$$ as their support. This is the case for the [location-scale distributions](@ref locscale) provided by `AdvancedVI`. 
For target distributions which the support is not the full $$\mathbb{R}^d$$, we can apply some transformation $$b$$ to $$q_{\lambda}$$ to match its support such that @@ -57,9 +83,11 @@ where $$b$$ is often called a *bijector*, since it is often chosen among bijecti This idea is known as automatic differentiation VI[^KTRGB2017] and has subsequently been improved by Tensorflow Probability[^DLTBV2017]. In Julia, [Bijectors.jl](https://github.com/TuringLang/Bijectors.jl)[^FXTYG2020] provides a comprehensive collection of bijections. -One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. -That is, the objective is now - +[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. +[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). Tensorflow distributions. arXiv. +[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. + One caveat of ADVI is that, after applying the bijection, a Jacobian adjustment needs to be applied. + That is, the objective is now ```math \mathrm{ADVI}\left(\lambda\right) \triangleq @@ -84,13 +112,10 @@ q_transformed = Bijectors.TransformedDistribution(q, binv) By passing `q_transformed` to `optimize`, the Jacobian adjustment for the bijector `b` is automatically applied. (See the [Basic Example](@ref basic) for a fully working example.) -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*. -[^DLTBV2017]: Dillon, J. V., Langmore, I., Tran, D., Brevdo, E., Vasudevan, S., Moore, D., ... & Saurous, R. A. (2017). 
Tensorflow distributions. arXiv. -[^FXTYG2020]: Fjelde, T. E., Xu, K., Tarek, M., Yalburgi, S., & Ge, H. (2020,. Bijectors. jl: Flexible transformations for probability distributions. In *Symposium on Advances in Approximate Bayesian Inference*. -## [Entropy Estimators](@id entropygrad) +## [Entropy Gradient Estimators](@id entropygrad) For the gradient of the entropy term, we provide three choices with varying requirements. -The user can select the entropy estimator by passing it as a keyword argument when constructing the `RepGradELBO` objective. +The user can select the entropy estimator by passing it as a keyword argument when constructing the algorithm object. | Estimator | `entropy(q)` | `logpdf(q)` | Type | |:--------------------------- |:------------:|:-----------:|:-------------------------------- | @@ -179,7 +204,7 @@ end In this example, the true posterior is contained within the variational family. This setting is known as "perfect variational family specification." -In this case, the `RepGradELBO` estimator with `StickingTheLandingEntropy` is the only estimator known to converge exponentially fast ("linear convergence") to the true solution. +In this case, `KLMinRepGradDescent` with `StickingTheLandingEntropy` is the only estimator known to converge exponentially fast ("linear convergence") to the true solution. Recall that the original ADVI objective with a closed-form entropy (CFE) is given as follows: @@ -281,7 +306,7 @@ Furthermore, in a lot of cases, a low-accuracy solution may be sufficient. [^KMG2024]: Kim, K., Ma, Y., & Gardner, J. (2024). Linear Convergence of Black-Box Variational Inference: Should We Stick the Landing?. In International Conference on Artificial Intelligence and Statistics (pp. 235-243). PMLR. 
## Advanced Usage -There are two major ways to customize the behavior of `RepGradELBO` +There are two major ways to customize the behavior of `KLMinRepGradDescent` - Customize the `Distributions` functions: `rand(q)`, `entropy(q)`, `logpdf(q)`. - Customize `AdvancedVI.reparam_with_entropy`. diff --git a/docs/src/klminrepgradproxdescent.md b/docs/src/klminrepgradproxdescent.md new file mode 100644 index 000000000..24931a033 --- /dev/null +++ b/docs/src/klminrepgradproxdescent.md @@ -0,0 +1,61 @@ +# [`KLMinRepGradProxDescent`](@id klminrepgradproxdescent) + +## Description + +This algorithm is a slight variation of [`KLMinRepGradDescent`](@ref klminrepgraddescent) specialized to [location-scale families](@ref locscale). +Therefore, it also aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence over the space of parameters. +But instead, it uses stochastic proximal gradient descent with the [proximal operator](@ref proximalocationscaleentropy) of the entropy of location-scale variational families as discussed in: [^D2020][^KMG2024][^DGG2023]. +The remainder of the section will only discuss details specific to `KLMinRepGradProxDescent`. +Thus, for general usage and additional details, please refer to the docs of `KLMinRepGradDescent` instead. + +```@docs +KLMinRepGradProxDescent +``` + +It implements the stochastic proximal gradient descent-based algorithm described in [^D2020][^KMG2024][^DGG2023]. + +## Methodology + +Recall that [KLMinRepGradDescent](@ref klminrepgraddescent) maximizes the ELBO. +Now, the ELBO can be re-written as follows: + +```math + \mathrm{ELBO}\left(q\right) \triangleq \mathcal{E}\left(q\right) + \mathbb{H}\left(q\right), +``` + +where + +```math + \mathcal{E}\left(q\right) = \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) +``` + +is often referred to as the *negative energy functional*. 
+`KLMinRepGradProxDescent` attempts to address the fact that maximizing the whole ELBO can be unstable due to non-smoothness of $$\mathbb{H}\left(q\right)$$[^D2020]. +For this, `KLMinRepGradProxDescent` relies on proximal stochastic gradient descent, where the problematic term $$\mathbb{H}\left(q\right)$$ is separately handled via a *proximal operator*. +Specifically, `KLMinRepGradProxDescent` first estimates the gradient of the energy $$\mathcal{E}\left(q\right)$$ only via the reparameterization gradient estimator. +Let us denote this as $$\widehat{\nabla_{\lambda} \mathcal{E}}\left(q_{\lambda}\right)$$. +Then `KLMinRepGradProxDescent` iterates the step + +```math + \lambda_{t+1} = \mathrm{prox}_{-\gamma_t \mathbb{H}}\big( + \lambda_{t} + \gamma_t \widehat{\nabla_{\lambda} \mathcal{E}}(q_{\lambda_t}) + \big) , +``` + +where + +```math +\mathrm{prox}_{h}(\lambda_t) += \argmin_{\lambda \in \Lambda}\left\{ + h(\lambda) + \frac{1}{2} {\lVert \lambda - \lambda_t \rVert}_2^2 +\right\} +``` + +is a proximal operator for the entropy. +As long as $$\mathrm{prox}_{-\gamma_t \mathbb{H}}$$ can be evaluated efficiently, this scheme can side-step the fact that $$\mathbb{H}(\lambda)$$ is difficult to deal with via gradient descent. +For location-scale families, it turns out the proximal operator of the entropy can be evaluated efficiently[^D2020], which is implemented as [`ProximalLocationScaleEntropy`](@ref proximalocationscaleentropy). +This has been empirically shown to be more robust[^D2020][^KMG2024]. + +[^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *International Conference on Machine Learning*. +[^KMG2024]: Kim, K., Ma, Y., & Gardner, J. (2024). Linear Convergence of Black-Box Variational Inference: Should We Stick the Landing?. In International Conference on Artificial Intelligence and Statistics (pp. 235-243). PMLR. +[^DGG2023]: Domke, J., Gower, R., & Garrigos, G. (2023). 
Provable convergence guarantees for black-box variational inference. Advances in neural information processing systems, 36, 66289-66327. diff --git a/docs/src/klminscoregraddescent.md b/docs/src/klminscoregraddescent.md new file mode 100644 index 000000000..dd1a96677 --- /dev/null +++ b/docs/src/klminscoregraddescent.md @@ -0,0 +1,79 @@ +# [`KLMinScoreGradDescent`](@id klminscoregraddescent) + +## Description + +This algorithm aims to minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence via stochastic gradient descent in the space of parameters. +Specifically, it uses the *score gradient* estimator, which is similar to the algorithm that was originally referred to as black-box variational inference (BBVI; [^RGB2014][^WW2013]). +(The term BBVI has also recently been used to refer to the more general setup of maximizing the ELBO in parameter space. We are using the more narrow definition, which restricts to the use of the score gradient.) +However, instead of using the vanilla score gradient estimator, we differentiate the "VarGrad" objective[^RBNRA2020], which results in the score gradient variance-reduced by the leave-one-out control variate[^SK2014][^KvHW2019]. +`KLMinScoreGradDescent` is also an alias of `BBVI`. + +[^RBNRA2020]: Richter, L., Boustati, A., Nüsken, N., Ruiz, F., & Akyildiz, O. D. (2020). Vargrad: a low-variance gradient estimator for variational inference. Advances in Neural Information Processing Systems, 33, 13481-13492. +[^SK2014]: Salimans, T., & Knowles, D. A. (2014). On using control variates with stochastic approximation for variational bayes and its connection to stochastic linear regression. arXiv preprint arXiv:1401.1022. 
+```@docs +KLMinScoreGradDescent +``` + +## Methodology + +This algorithm aims to solve the problem + +```math + \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) +``` + +where $\mathcal{Q}$ is some family of distributions, often called the variational family, by running stochastic gradient descent in the (Euclidean) space of parameters. +That is, for all $$q_{\lambda} \in \mathcal{Q}$$, we assume that there is a corresponding vector of parameters $$\lambda \in \Lambda$$, where the space of parameters is Euclidean such that $$\Lambda \subset \mathbb{R}^p$$. + +Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. +Instead, the ELBO maximization strategy maximizes a surrogate objective, the *evidence lower bound* (ELBO; [^JGJS1999]) + +```math + \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), +``` + +which is equivalent to the negative KL divergence up to an additive constant (the log-evidence). + +Algorithmically, `KLMinScoreGradDescent` iterates the step + +```math + \lambda_{t+1} = \mathrm{operator}\big( + \lambda_{t} + \gamma_t \widehat{\nabla_{\lambda} \mathrm{ELBO}} (q_{\lambda_t}) + \big) , +``` + +where $\widehat{\nabla \mathrm{ELBO}}(q_{\lambda})$ is the score gradient estimate[^G1990][^KR1996][^RSU1996][^W1992] of the ELBO gradient and $$\mathrm{operator}$$ is an optional operator (*e.g.* projections, identity mapping). + +Let us describe the score gradient estimator[^G1990][^KR1996][^RSU1996][^W1992] of the ELBO gradient, also known as the score function method and the REINFORCE gradient. +For variational inference, the use of the score gradient was proposed in [^WW2013][^RGB2014]. 
+Unlike the reparameterization gradient, the score gradient does not require the target log density to be differentiable, and does not differentiate through the sampling process of the variational approximation $q$. +Instead, it only requires gradients of the log density $\log q$. +For this reason, the score gradient is the standard method to deal with discrete variables and targets with discrete support. +In more detail, the score gradient uses the Fisher log-derivative identity: For any regular $f$, + +```math +\nabla_{\lambda} \mathbb{E}_{z \sim q_{\lambda}} f(z) += +\mathbb{E}_{z \sim q_{\lambda}}\left[ f(z) \nabla_{\lambda} \log q_{\lambda}(z) \right] \; . +``` + +The ELBO corresponds to the case where $f = \log \pi - \log q_{\lambda}$, where $\log \pi$ is the target log density. +Instead of implementing the canonical score gradient, `KLMinScoreGradDescent` internally uses the "VarGrad" objective[^RBNRA2020]: + +```math +\widehat{\mathrm{VarGrad}}(\lambda) += +\mathrm{Var}\left( \log q_{\lambda}(z_i) - \log \pi\left(z_i\right) \right) \; , +``` + +where the variance is computed over the samples $z_1, \ldots, z_m \sim q_{\lambda}$. +Differentiating the VarGrad objective corresponds to the canonical score gradient combined with the "leave-one-out" control variate[^SK2014][^KvHW2019]. + +[^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. +[^G1990]: Glynn, P. W. (1990). Likelihood ratio gradient estimation for stochastic systems. Communications of the ACM, 33(10), 75-84. +[^KR1996]: Kleijnen, J. P., & Rubinstein, R. Y. (1996). Optimization and sensitivity analysis of computer simulation models by the score function method. European Journal of Operational Research, 88(3), 413-427. +[^RSU1996]: Rubinstein, R. Y., Shapiro, A., & Uryasev, S. (1996). The score function method. Encyclopedia of Management Sciences, 1363-1366. +[^W1992]: Williams, R. J. (1992). 
Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning, 8, 229-256. +[^WW2013]: Wingate, D., & Weber, T. (2013). Automated variational inference in probabilistic programming. arXiv preprint arXiv:1301.1299. +[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014). Black box variational inference. In Artificial intelligence and statistics (pp. 814-822). PMLR. +[^KvHW2019]: Kool, W., van Hoof, H., & Welling, M. (2019). Buy 4 reinforce samples, get a baseline for free!. diff --git a/docs/src/paramspacesgd/general.md b/docs/src/paramspacesgd/general.md deleted file mode 100644 index 347a9dc82..000000000 --- a/docs/src/paramspacesgd/general.md +++ /dev/null @@ -1,98 +0,0 @@ - -# [General](@id paramspacesgd) - -`ParamSpaceSGD` SGD is a general algorithm for leveraging automatic differentiation and SGD. -Furthermore, it operates in the space of *variational parameters*. -Consider the case where each member $q_{\lambda} \in \mathcal{Q}$ of the variational family $\mathcal{Q}$ is uniquely represented through a collection of parameters $\lambda \in \Lambda \subseteq \mathbb{R}^p$. -That is, - -```math -\mathcal{Q} = \{q_{\lambda} \mid \lambda \in \Lambda \}, -``` -Then, as implied by the name, `ParamSpaceSGD` runs SGD on $\Lambda$, the (Euclidean) space of parameters. - -Any algorithm that operates by iterating the following steps can easily be implemented via `ParamSpaceSGD`: - -1. Obtain an unbiased estimate of the target objective. -2. Obtain an estimate of the gradient of the objective by differentiating the objective estimate with respect to the parameters. -3. Perform gradient descent with the stochastic gradient estimate. - -After some simplifications, each `step` of `ParamSpaceSGD` can be described as follows: - -```julia -function step(rng, alg::ParamSpaceSGD, state, callback, objargs...; kwargs...) 
- (; adtype, problem, objective, operator, averager) = alg - (; q, iteration, grad_buf, opt_st, obj_st, avg_st) = state - iteration += 1 - - # Extract variational parameters of `q` - params, re = Optimisers.destructure(q) - - # Estimate gradient and update the `DiffResults` buffer `grad_buf`. - grad_buf, obj_st, info = estimate_gradient!(...) - - # Gradient descent step. - grad = DiffResults.gradient(grad_buf) - opt_st, params = Optimisers.update!(opt_st, params, grad) - - # Apply operator - params = apply(operator, typeof(q), opt_st, params, re) - - # Apply parameter averaging - avg_st = apply(averager, avg_st, params) - - # Updated state - state = ParamSpaceSGDState(re(params), iteration, grad_buf, opt_st, obj_st, avg_st) - state, false, info -end -``` -The output of `ParamSpaceSGD` is the final state of `averager`. -Furthermore, `operator` can be anything from an identity mapping, a projection operator, a proximal operator, and so on. - -## `ParamSpaceSGD` -The constructor for `ParamSpaceSGD` is as follows: - -```@docs -ParamSpaceSGD -``` - -## Objective Interface - -To define an instance of a `ParamSpaceSGD` algorithm, it suffices to implement the `AbstractVariationalObjective` interface. -First, we need to define a subtype of `AbstractVariationalObjective`: - -```@docs -AdvancedVI.AbstractVariationalObjective -``` - -In addition, we need to implement some methods associated with the objective. -First, each objective may maintain a state such as buffers, online estimates of control variates, batch iterators for subsampling, and so on. -Such things should be initialized by implementing the following: - -```@docs -AdvancedVI.init( - ::Random.AbstractRNG, - ::AdvancedVI.AbstractVariationalObjective, - ::ADTypes.AbstractADType, - ::Any, - ::Any, - ::Any, - ::Any, -) -``` -If this method is not implemented, the state will be automatically be `nothing`. 
- -Next, the key functionality of estimating stochastic gradients should be implemented through the following: - -```@docs -AdvancedVI.estimate_gradient! -``` - -`AdvancedVI` only interacts with each variational objective by querying gradient estimates. -In a lot of cases, however, it is convinient to be able to estimate the current value of the objective. -For example, for monitoring convergence. -This should be done through the following: - -```@docs -AdvancedVI.estimate_objective -``` diff --git a/docs/src/paramspacesgd/klminrepgraddescent.md b/docs/src/paramspacesgd/klminrepgraddescent.md deleted file mode 100644 index 6212085dc..000000000 --- a/docs/src/paramspacesgd/klminrepgraddescent.md +++ /dev/null @@ -1,10 +0,0 @@ -# [`KLMinRepGradDescent`](@id klminrepgraddescent) - -This is a convenience constructor for [`ParamSpaceSGD`](@ref paramspacesgd) with the [`RepGradELBO`](@ref repgradelbo) objective. -This is equivalent to the algorithm commonly referred as automatic differentiation variational inference[^KTRGB2017]. -`KLMinRepGradDescent` is also an alias of `ADVI` . - -[^KTRGB2017]: Kucukelbir, A., Tran, D., Ranganath, R., Gelman, A., & Blei, D. M. (2017). Automatic differentiation variational inference. *Journal of Machine Learning Research*, 18(14), 1-45. -```@docs -KLMinRepGradDescent -``` diff --git a/docs/src/paramspacesgd/klminrepgradproxdescent.md b/docs/src/paramspacesgd/klminrepgradproxdescent.md deleted file mode 100644 index 831c86bd5..000000000 --- a/docs/src/paramspacesgd/klminrepgradproxdescent.md +++ /dev/null @@ -1,11 +0,0 @@ -# [`KLMinRepGradProxDescent`](@id klminrepgradproxdescent) - -This is a convenience constructor for [`ParamSpaceSGD`](@ref paramspacesgd) with the [`RepGradELBO`](@ref repgradelbo) objective with a proximal operator of the entropy (see [here](@ref proximalocationscaleentropy)) of location-scale variational families. 
-It implements the stochastic proximal gradient descent-based algorithm described in: [^D2020][^KMG2024][^DGG2023]. - -[^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *International Conference on Machine Learning*. -[^KMG2024]: Kim, K., Ma, Y., & Gardner, J. (2024). Linear Convergence of Black-Box Variational Inference: Should We Stick the Landing?. In International Conference on Artificial Intelligence and Statistics (pp. 235-243). PMLR. -[^DGG2023]: Domke, J., Gower, R., & Garrigos, G. (2023). Provable convergence guarantees for black-box variational inference. Advances in neural information processing systems, 36, 66289-66327. -```@docs -KLMinRepGradProxDescent -``` diff --git a/docs/src/paramspacesgd/klminscoregraddescent.md b/docs/src/paramspacesgd/klminscoregraddescent.md deleted file mode 100644 index 514a1410b..000000000 --- a/docs/src/paramspacesgd/klminscoregraddescent.md +++ /dev/null @@ -1,15 +0,0 @@ -# [`KLMinScoreGradDescent`](@id klminscoregraddescent) - -This is a convenience constructor for [`ParamSpaceSGD`](@ref paramspacesgd) with the [`ScoreGradELBO`](@ref scoregradelbo) objective. -This is similar to the algorithm that was originally referred to as black-box variational inference (BBVI; [^RGB2014][^WW2013]). -(The term BBVI has also recently been used to refer to the more general setup of maximizing the ELBO in parameter space. We are using the more narrow definition, which restricts to the use of the score gradient.) -However, instead of using the vanilla score gradient estimator, we differentiate the "VarGrad" objective[^RBNRA2020], which results in the score gradient variance-reduced by the leave-one-out control variate[^SK2014][^KvHW2019]. - -[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014, April). Black box variational inference. In *Artificial Intelligence and Statistics* (pp. 814-822). PMLR. -[^WW2013]: Wingate, D., & Weber, T. (2013). 
Automated variational inference in probabilistic programming. arXiv preprint arXiv:1301.1299. -[^RBNRA2020]: Richter, L., Boustati, A., Nüsken, N., Ruiz, F., & Akyildiz, O. D. (2020). Vargrad: a low-variance gradient estimator for variational inference. Advances in Neural Information Processing Systems, 33, 13481-13492. -[^SK2014]: Salimans, T., & Knowles, D. A. (2014). On using control variates with stochastic approximation for variational bayes and its connection to stochastic linear regression. arXiv preprint arXiv:1401.1022. -[^KvHW2019]: Kool, W., van Hoof, H., & Welling, M. (2019). Buy 4 reinforce samples, get a baseline for free!. -```@docs -KLMinScoreGradDescent -``` diff --git a/docs/src/paramspacesgd/objectives.md b/docs/src/paramspacesgd/objectives.md deleted file mode 100644 index d3a822cf7..000000000 --- a/docs/src/paramspacesgd/objectives.md +++ /dev/null @@ -1,35 +0,0 @@ -# Overview of Algorithms - -This section will provide an overview of the algorithm form by each objectives provided by `AdvancedVI`. - -## Evidence Lower Bound Maximization - -Evidence lower bound (ELBO) maximization[^JGJS1999] is a general family of algorithms that minimize the exclusive (or reverse) Kullback-Leibler (KL) divergence between the target distribution ``\pi`` and a variational approximation ``q_{\lambda}``. -More generally, it aims to solve the problem - -```math - \mathrm{minimize}_{q \in \mathcal{Q}}\quad \mathrm{KL}\left(q, \pi\right) \; , -``` - -where $\mathcal{Q}$ is some family of distributions, often called the variational family. -Since we usually only have access to the unnormalized densities of the target distribution $\pi$, we don't have direct access to the KL divergence. 
-Instead, the ELBO maximization strategy maximizes a surrogate objective, the *ELBO*: - -```math - \mathrm{ELBO}\left(q\right) \triangleq \mathbb{E}_{\theta \sim q} \log \pi\left(\theta\right) + \mathbb{H}\left(q\right), -``` - -which is equivalent to the KL up to an additive constant (the evidence). -The ELBO and its gradient can be readily estimated through various strategies. -Overall, ELBO maximization algorithms aim to solve the problem: - -```math - \mathrm{minimize}_{q \in \mathcal{Q}}\quad -\mathrm{ELBO}\left(q\right). -``` - -Multiple ways to solve this problem exist, each leading to a different variational inference algorithm. `AdvancedVI` provides the following objectives: - - - [RepGradELBO](@ref repgradelbo): Implements the reparameterization gradient estimator of the ELBO gradient. - - [ScoreGradELBO](@ref scoregradelbo): Implements the score gradient estimator of the ELBO gradient. - -[^JGJS1999]: Jordan, M. I., Ghahramani, Z., Jaakkola, T. S., & Saul, L. K. (1999). An introduction to variational methods for graphical models. Machine learning, 37, 183-233. diff --git a/docs/src/paramspacesgd/scoregradelbo.md b/docs/src/paramspacesgd/scoregradelbo.md deleted file mode 100644 index c86ceca10..000000000 --- a/docs/src/paramspacesgd/scoregradelbo.md +++ /dev/null @@ -1,45 +0,0 @@ -# [Score Gradient Estimator](@id scoregradelbo) - -## Overview - -The `ScoreGradELBO` implements the score gradient estimator[^G1990][^KR1996][^RSU1996][^W1992] of the ELBO gradient, also known as the score function method and the REINFORCE gradient. -For variational inference, the use of the score gradient was proposed in [^WW2013][^RGB2014]. -Unlike the [reparameterization gradient](@ref repgradelbo), the score gradient does not require the target log density to be differentiable, and does not differentiate through the sampling process of the variational approximation $q$. -Instead, it only requires gradients of the log density $\log q$. 
-For this reason, the score gradient is the standard method to deal with discrete variables and targets with discrete support. - -[^G1990]: Glynn, P. W. (1990). Likelihood ratio gradient estimation for stochastic systems. Communications of the ACM, 33(10), 75-84. -[^KR1996]: Kleijnen, J. P., & Rubinstein, R. Y. (1996). Optimization and sensitivity analysis of computer simulation models by the score function method. European Journal of Operational Research, 88(3), 413-427. -[^RSU1996]: Rubinstein, R. Y., Shapiro, A., & Uryasev, S. (1996). The score function method. Encyclopedia of Management Sciences, 1363-1366. -[^W1992]: Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. Machine learning, 8, 229-256. -[^WW2013]: Wingate, D., & Weber, T. (2013). Automated variational inference in probabilistic programming. arXiv preprint arXiv:1301.1299. -[^RGB2014]: Ranganath, R., Gerrish, S., & Blei, D. (2014). Black box variational inference. In Artificial intelligence and statistics (pp. 814-822). PMLR. - In more detail, the score gradient uses the Fisher log-derivative identity: For any regular $f$, -```math -\nabla_{\lambda} \mathbb{E}_{z \sim q_{\lambda}} f -= -\mathbb{E}_{z \sim q_{\lambda}}\left[ f(z) \log q_{\lambda}(z) \right] \; . -``` - -The ELBO corresponds to the case where $f = \log \pi / \log q$, where $\log \pi$ is the target log density. - -Instead of implementing the canonical score gradient, `ScoreGradELBO` uses the "VarGrad" objective[^RBNRA2020]: - -```math -\widehat{\mathrm{VarGrad}}(\lambda) -= -\mathrm{Var}\left( \log q_{\lambda}(z_i) - \log \pi\left(z_i\right) \right) \; , -``` - -where the variance is computed over the samples $z_1, \ldots, z_m \sim q_{\lambda}$. -Differentiating the VarGrad objective corresponds to the canonical score gradient combined with the "leave-one-out" control variate[^SK2014][^KvHW2019]. 
- -[^RBNRA2020]: Richter, L., Boustati, A., Nüsken, N., Ruiz, F., & Akyildiz, O. D. (2020). Vargrad: a low-variance gradient estimator for variational inference. Advances in Neural Information Processing Systems, 33, 13481-13492. -[^SK2014]: Salimans, T., & Knowles, D. A. (2014). On using control variates with stochastic approximation for variational bayes and its connection to stochastic linear regression. arXiv preprint arXiv:1401.1022. -[^KvHW2019]: Kool, W., van Hoof, H., & Welling, M. (2019). Buy 4 reinforce samples, get a baseline for free!. - Since the expectation of the `VarGrad` objective (not its gradient) is not exactly the ELBO, we separately obtain an unbiased estimate of the ELBO to be returned by [`estimate_objective`](@ref). -## `ScoreGradELBO` - -```@docs -ScoreGradELBO -``` diff --git a/ext/AdvancedVIBijectorsExt.jl b/ext/AdvancedVIBijectorsExt.jl index a28dc9d0f..0e85d4d43 100644 --- a/ext/AdvancedVIBijectorsExt.jl +++ b/ext/AdvancedVIBijectorsExt.jl @@ -25,7 +25,15 @@ function AdvancedVI.init( obj_st = AdvancedVI.init(rng, objective, adtype, q_init, prob, params, re) avg_st = AdvancedVI.init(averager, params) grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - return AdvancedVI.ParamSpaceSGDState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) + return ( + prob=prob, + q=q_init, + iteration=0, + grad_buf=grad_buf, + opt_st=opt_st, + obj_st=obj_st, + avg_st=avg_st, + ) end function AdvancedVI.apply( diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl index 6f51a9f08..37b16b0ed 100644 --- a/src/AdvancedVI.jl +++ b/src/AdvancedVI.jl @@ -277,13 +277,10 @@ export optimize include("utils.jl") include("optimize.jl") -## Parameter Space SGD -include("algorithms/paramspacesgd/abstractobjective.jl") -include("algorithms/paramspacesgd/paramspacesgd.jl") +## Parameter Space SGD Implementations -export ParamSpaceSGD +include("algorithms/abstractobjective.jl") -## Parameter Space SGD Implementations ### ELBO Maximization abstract type 
AbstractEntropyEstimator end @@ -304,10 +301,10 @@ Estimate the entropy of `q`. """ function estimate_entropy end -include("algorithms/paramspacesgd/subsampledobjective.jl") -include("algorithms/paramspacesgd/repgradelbo.jl") -include("algorithms/paramspacesgd/scoregradelbo.jl") -include("algorithms/paramspacesgd/entropy.jl") +include("algorithms/subsampledobjective.jl") +include("algorithms/repgradelbo.jl") +include("algorithms/scoregradelbo.jl") +include("algorithms/entropy.jl") export RepGradELBO, ScoreGradELBO, @@ -318,7 +315,8 @@ export RepGradELBO, StickingTheLandingEntropyZeroGradient, SubsampledObjective -include("algorithms/paramspacesgd/constructors.jl") +include("algorithms/constructors.jl") +include("algorithms/interface.jl") export KLMinRepGradDescent, KLMinRepGradProxDescent, KLMinScoreGradDescent, ADVI, BBVI diff --git a/src/algorithms/paramspacesgd/abstractobjective.jl b/src/algorithms/abstractobjective.jl similarity index 100% rename from src/algorithms/paramspacesgd/abstractobjective.jl rename to src/algorithms/abstractobjective.jl diff --git a/src/algorithms/paramspacesgd/constructors.jl b/src/algorithms/constructors.jl similarity index 63% rename from src/algorithms/paramspacesgd/constructors.jl rename to src/algorithms/constructors.jl index 2ec0ae41b..32c85b228 100644 --- a/src/algorithms/paramspacesgd/constructors.jl +++ b/src/algorithms/constructors.jl @@ -18,6 +18,22 @@ KL divergence minimization by running stochastic gradient descent with the repar - `operator::AbstractOperator`: Operator to be applied after each gradient descent step. (default: `IdentityOperator()`) - `subsampling::Union{<:Nothing,<:AbstractSubsampling}`: Data point subsampling strategy. If `nothing`, subsampling is not used. (default: `nothing`) +# Output +- `q_averaged`: The variational approximation formed by the averaged SGD iterates. 
+ +# Callback +The callback function `callback` has a signature of + + callback(; rng, iteration, restructure, params, averaged_params, gradient) + +The arguments are as follows: +- `rng`: Random number generator internally used by the algorithm. +- `iteration`: The index of the current iteration. +- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. +- `params`: Current variational parameters. +- `averaged_params`: Variational parameters averaged according to the averaging strategy. +- `gradient`: The estimated (possibly stochastic) gradient. + # Requirements - The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. This requires the variational approximation to be marked as a functor through `Functors.@functor`. - The variational approximation ``q_{\\lambda}`` implements `rand`. @@ -25,6 +41,20 @@ KL divergence minimization by running stochastic gradient descent with the repar - The target `LogDensityProblems.logdensity(prob, x)` must be differentiable with respect to `x` by the selected AD backend. - Additonal requirements on `q` may apply depending on the choice of `entropy`. 
""" +struct KLMinRepGradDescent{ + Obj<:Union{<:RepGradELBO,<:SubsampledObjective}, + AD<:ADTypes.AbstractADType, + Opt<:Optimisers.AbstractRule, + Avg<:AbstractAverager, + Op<:AbstractOperator, +} <: AbstractVariationalAlgorithm + objective::Obj + adtype::AD + optimizer::Opt + averager::Avg + operator::Op +end + function KLMinRepGradDescent( adtype::ADTypes.AbstractADType; entropy::Union{<:ClosedFormEntropy,<:StickingTheLandingEntropy,<:MonteCarloEntropy}=ClosedFormEntropy(), @@ -39,7 +69,11 @@ function KLMinRepGradDescent( else SubsampledObjective(RepGradELBO(n_samples; entropy=entropy), subsampling) end - return ParamSpaceSGD(objective, adtype, optimizer, averager, operator) + return KLMinRepGradDescent{ + typeof(objective),typeof(adtype),typeof(optimizer),typeof(averager),typeof(operator) + }( + objective, adtype, optimizer, averager, operator + ) end const ADVI = KLMinRepGradDescent @@ -63,12 +97,42 @@ Thus, only the entropy estimators with a "ZeroGradient" suffix are allowed. - `averager::AbstractAverager`: Parameter averaging strategy. (default: `PolynomialAveraging()`) - `subsampling::Union{<:Nothing,<:AbstractSubsampling}`: Data point subsampling strategy. If `nothing`, subsampling is not used. (default: `nothing`) +# Output +- `q_averaged`: The variational approximation formed by the averaged SGD iterates. + +# Callback +The callback function `callback` has a signature of + + callback(; rng, iteration, restructure, params, averaged_params, gradient) + +The arguments are as follows: +- `rng`: Random number generator internally used by the algorithm. +- `iteration`: The index of the current iteration. +- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. +- `params`: Current variational parameters. +- `averaged_params`: Variational parameters averaged according to the averaging strategy. 
+- `gradient`: The estimated (possibly stochastic) gradient. + # Requirements - The variational family is `MvLocationScale`. - The target distribution and the variational approximation have the same support. - The target `LogDensityProblems.logdensity(prob, x)` must be differentiable with respect to `x` by the selected AD backend. - Additonal requirements on `q` may apply depending on the choice of `entropy_zerograd`. """ +struct KLMinRepGradProxDescent{ + Obj<:Union{<:RepGradELBO,<:SubsampledObjective}, + AD<:ADTypes.AbstractADType, + Opt<:Optimisers.AbstractRule, + Avg<:AbstractAverager, + Op<:ProximalLocationScaleEntropy, +} <: AbstractVariationalAlgorithm + objective::Obj + adtype::AD + optimizer::Opt + averager::Avg + operator::Op +end + function KLMinRepGradProxDescent( adtype::ADTypes.AbstractADType; entropy_zerograd::Union{ @@ -85,7 +149,11 @@ function KLMinRepGradProxDescent( else SubsampledObjective(RepGradELBO(n_samples; entropy=entropy_zerograd), subsampling) end - return ParamSpaceSGD(objective, adtype, optimizer, averager, operator) + return KLMinRepGradProxDescent{ + typeof(objective),typeof(adtype),typeof(optimizer),typeof(averager),typeof(operator) + }( + objective, adtype, optimizer, averager, operator + ) end """ @@ -106,15 +174,45 @@ KL divergence minimization by running stochastic gradient descent with the score - `operator::Union{<:IdentityOperator, <:ClipScale}`: Operator to be applied after each gradient descent step. (default: `IdentityOperator()`) - `subsampling::Union{<:Nothing,<:AbstractSubsampling}`: Data point subsampling strategy. If `nothing`, subsampling is not used. (default: `nothing`) +# Output +- `q_averaged`: The variational approximation formed by the averaged SGD iterates. 
+ +# Callback +The callback function `callback` has a signature of + + callback(; rng, iteration, restructure, params, averaged_params, gradient) + +The arguments are as follows: +- `rng`: Random number generator internally used by the algorithm. +- `iteration`: The index of the current iteration. +- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. +- `params`: Current variational parameters. +- `averaged_params`: Variational parameters averaged according to the averaging strategy. +- `gradient`: The estimated (possibly stochastic) gradient. + # Requirements - The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. This requires the variational approximation to be marked as a functor through `Functors.@functor`. - The variational approximation ``q_{\\lambda}`` implements `rand`. - The variational approximation ``q_{\\lambda}`` implements `logpdf(q, x)`, which should also be differentiable with respect to `x`. - The target distribution and the variational approximation have the same support. 
""" +struct KLMinScoreGradDescent{ + Obj<:Union{<:ScoreGradELBO,<:SubsampledObjective}, + AD<:ADTypes.AbstractADType, + Opt<:Optimisers.AbstractRule, + Avg<:AbstractAverager, + Op<:AbstractOperator, +} <: AbstractVariationalAlgorithm + objective::Obj + adtype::AD + optimizer::Opt + averager::Avg + operator::Op +end + function KLMinScoreGradDescent( adtype::ADTypes.AbstractADType; - optimizer::Union{<:Descent,<:DoG,<:DoWG}=DoWG(), + optimizer::Optimisers.AbstractRule=DoWG(), n_samples::Int=1, averager::AbstractAverager=PolynomialAveraging(), operator::AbstractOperator=IdentityOperator(), @@ -125,7 +223,11 @@ function KLMinScoreGradDescent( else SubsampledObjective(ScoreGradELBO(n_samples), subsampling) end - return ParamSpaceSGD(objective, adtype, optimizer, averager, operator) + return KLMinScoreGradDescent{ + typeof(objective),typeof(adtype),typeof(optimizer),typeof(averager),typeof(operator) + }( + objective, adtype, optimizer, averager, operator + ) end const BBVI = KLMinScoreGradDescent diff --git a/src/algorithms/paramspacesgd/entropy.jl b/src/algorithms/entropy.jl similarity index 100% rename from src/algorithms/paramspacesgd/entropy.jl rename to src/algorithms/entropy.jl diff --git a/src/algorithms/interface.jl b/src/algorithms/interface.jl new file mode 100644 index 000000000..88a2623cd --- /dev/null +++ b/src/algorithms/interface.jl @@ -0,0 +1,83 @@ + +""" +This family of algorithms (`<:KLMinRepGradDescent`,`<:KLMinRepGradProxDescent`,`<:KLMinScoreGradDescent`) applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. +The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. +This requires the variational approximation to be marked as a functor through `Functors.@functor`. 
+""" +const ParamSpaceSGD = Union{ + <:KLMinRepGradDescent,<:KLMinRepGradProxDescent,<:KLMinScoreGradDescent +} + +function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) + (; adtype, optimizer, averager, objective, operator) = alg + if q_init isa AdvancedVI.MvLocationScale && operator isa AdvancedVI.IdentityOperator + @warn( + "IdentityOperator is used with a variational family <:MvLocationScale. Optimization can easily fail under this combination due to singular scale matrices. Consider using the operator `ClipScale` in the algorithm instead.", + ) + end + params, re = Optimisers.destructure(q_init) + opt_st = Optimisers.setup(optimizer, params) + obj_st = init(rng, objective, adtype, q_init, prob, params, re) + avg_st = init(averager, params) + grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) + return ( + prob=prob, + q=q_init, + iteration=0, + grad_buf=grad_buf, + opt_st=opt_st, + obj_st=obj_st, + avg_st=avg_st, + ) +end + +function output(alg::ParamSpaceSGD, state) + params_avg = value(alg.averager, state.avg_st) + _, re = Optimisers.destructure(state.q) + return re(params_avg) +end + +function step( + rng::Random.AbstractRNG, alg::ParamSpaceSGD, state, callback, objargs...; kwargs... +) + (; adtype, objective, operator, averager) = alg + (; prob, q, iteration, grad_buf, opt_st, obj_st, avg_st) = state + + iteration += 1 + + params, re = Optimisers.destructure(q) + + grad_buf, obj_st, info = estimate_gradient!( + rng, objective, adtype, grad_buf, obj_st, params, re, objargs... 
+ ) + + grad = DiffResults.gradient(grad_buf) + opt_st, params = Optimisers.update!(opt_st, params, grad) + params = apply(operator, typeof(q), opt_st, params, re) + avg_st = apply(averager, avg_st, params) + + state = ( + prob=prob, + q=re(params), + iteration=iteration, + grad_buf=grad_buf, + opt_st=opt_st, + obj_st=obj_st, + avg_st=avg_st, + ) + + if !isnothing(callback) + averaged_params = value(averager, avg_st) + info′ = callback(; + rng, + iteration, + restructure=re, + params=params, + averaged_params=averaged_params, + gradient=grad, + state=state, + ) + info = !isnothing(info′) ? merge(info′, info) : info + end + state, false, info +end diff --git a/src/algorithms/paramspacesgd/paramspacesgd.jl b/src/algorithms/paramspacesgd/paramspacesgd.jl deleted file mode 100644 index 92bbb0e51..000000000 --- a/src/algorithms/paramspacesgd/paramspacesgd.jl +++ /dev/null @@ -1,125 +0,0 @@ - -""" - ParamSpaceSGD( - objective::AbstractVariationalObjective, - adtype::ADTypes.AbstractADType, - optimizer::Optimisers.AbstractRule, - averager::AbstractAverager, - operator::AbstractOperator, - ) - -This algorithm applies stochastic gradient descent (SGD) to the variational `objective` over the (Euclidean) space of variational parameters. - -The trainable parameters in the variational approximation are expected to be extractable through `Optimisers.destructure`. -This requires the variational approximation to be marked as a functor through `Functors.@functor`. - -!!! note - Different objective may impose different requirements on `adtype`, variational family, `optimizer`, and `operator`. It is therefore important to check the documentation corresponding to each specific objective. Essentially, each objective should be thought as forming its own unique algorithm. - -# Arguments -- `objective`: Variational Objective. -- `adtype`: Automatic differentiation backend. -- `optimizer`: Optimizer used for inference. -- `averager` : Parameter averaging strategy. 
-- `operator` : Operator applied to the parameters after each optimization step. - -# Output -- `q_averaged`: The variational approximation formed from the averaged SGD iterates. - -# Callback -The callback function `callback` has a signature of - - callback(; rng, iteration, restructure, params, averaged_params, restructure, gradient) - -The arguments are as follows: -- `rng`: Random number generator internally used by the algorithm. -- `iteration`: The index of the current iteration. -- `restructure`: Function that restructures the variational approximation from the variational parameters. Calling `restructure(params)` reconstructs the current variational approximation. -- `params`: Current variational parameters. -- `averaged_params`: Variational parameters averaged according to the averaging strategy. -- `gradient`: The estimated (possibly stochastic) gradient. - -""" -struct ParamSpaceSGD{ - Obj<:AbstractVariationalObjective, - AD<:ADTypes.AbstractADType, - Opt<:Optimisers.AbstractRule, - Avg<:AbstractAverager, - Op<:AbstractOperator, -} <: AbstractVariationalAlgorithm - objective::Obj - adtype::AD - optimizer::Opt - averager::Avg - operator::Op -end - -struct ParamSpaceSGDState{P,Q,GradBuf,OptSt,ObjSt,AvgSt} - prob::P - q::Q - iteration::Int - grad_buf::GradBuf - opt_st::OptSt - obj_st::ObjSt - avg_st::AvgSt -end - -function init(rng::Random.AbstractRNG, alg::ParamSpaceSGD, q_init, prob) - (; adtype, optimizer, averager, objective, operator) = alg - if q_init isa AdvancedVI.MvLocationScale && operator isa AdvancedVI.IdentityOperator - @warn( - "IdentityOperator is used with a variational family <:MvLocationScale. Optimization can easily fail under this combination due to singular scale matrices. 
Consider using the operator `ClipScale` in the algorithm instead.", - ) - end - params, re = Optimisers.destructure(q_init) - opt_st = Optimisers.setup(optimizer, params) - obj_st = init(rng, objective, adtype, q_init, prob, params, re) - avg_st = init(averager, params) - grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params)) - return ParamSpaceSGDState(prob, q_init, 0, grad_buf, opt_st, obj_st, avg_st) -end - -function output(alg::ParamSpaceSGD, state) - params_avg = value(alg.averager, state.avg_st) - _, re = Optimisers.destructure(state.q) - return re(params_avg) -end - -function step( - rng::Random.AbstractRNG, alg::ParamSpaceSGD, state, callback, objargs...; kwargs... -) - (; adtype, objective, operator, averager) = alg - (; prob, q, iteration, grad_buf, opt_st, obj_st, avg_st) = state - - iteration += 1 - - params, re = Optimisers.destructure(q) - - grad_buf, obj_st, info = estimate_gradient!( - rng, objective, adtype, grad_buf, obj_st, params, re, objargs... - ) - - grad = DiffResults.gradient(grad_buf) - opt_st, params = Optimisers.update!(opt_st, params, grad) - params = apply(operator, typeof(q), opt_st, params, re) - avg_st = apply(averager, avg_st, params) - - state = ParamSpaceSGDState( - prob, re(params), iteration, grad_buf, opt_st, obj_st, avg_st - ) - - if !isnothing(callback) - averaged_params = value(averager, avg_st) - info′ = callback(; - rng, - iteration, - restructure=re, - params=params, - averaged_params=averaged_params, - gradient=grad, - state=state, - ) - info = !isnothing(info′) ? 
merge(info′, info) : info - end - state, false, info -end diff --git a/src/algorithms/paramspacesgd/repgradelbo.jl b/src/algorithms/repgradelbo.jl similarity index 100% rename from src/algorithms/paramspacesgd/repgradelbo.jl rename to src/algorithms/repgradelbo.jl diff --git a/src/algorithms/paramspacesgd/scoregradelbo.jl b/src/algorithms/scoregradelbo.jl similarity index 100% rename from src/algorithms/paramspacesgd/scoregradelbo.jl rename to src/algorithms/scoregradelbo.jl diff --git a/src/algorithms/paramspacesgd/subsampledobjective.jl b/src/algorithms/subsampledobjective.jl similarity index 100% rename from src/algorithms/paramspacesgd/subsampledobjective.jl rename to src/algorithms/subsampledobjective.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo.jl b/test/algorithms/repgradelbo.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo.jl rename to test/algorithms/repgradelbo.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_locationscale.jl b/test/algorithms/repgradelbo_locationscale.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_locationscale.jl rename to test/algorithms/repgradelbo_locationscale.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_locationscale_bijectors.jl b/test/algorithms/repgradelbo_locationscale_bijectors.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_locationscale_bijectors.jl rename to test/algorithms/repgradelbo_locationscale_bijectors.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale.jl b/test/algorithms/repgradelbo_proximal_locationscale.jl similarity index 100% rename from test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale.jl rename to test/algorithms/repgradelbo_proximal_locationscale.jl diff --git a/test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale_bijectors.jl b/test/algorithms/repgradelbo_proximal_locationscale_bijectors.jl similarity index 100% rename from 
test/algorithms/paramspacesgd/repgradelbo_proximal_locationscale_bijectors.jl rename to test/algorithms/repgradelbo_proximal_locationscale_bijectors.jl diff --git a/test/algorithms/paramspacesgd/scoregradelbo.jl b/test/algorithms/scoregradelbo.jl similarity index 100% rename from test/algorithms/paramspacesgd/scoregradelbo.jl rename to test/algorithms/scoregradelbo.jl diff --git a/test/algorithms/paramspacesgd/scoregradelbo_locationscale.jl b/test/algorithms/scoregradelbo_locationscale.jl similarity index 100% rename from test/algorithms/paramspacesgd/scoregradelbo_locationscale.jl rename to test/algorithms/scoregradelbo_locationscale.jl diff --git a/test/algorithms/paramspacesgd/scoregradelbo_locationscale_bijectors.jl b/test/algorithms/scoregradelbo_locationscale_bijectors.jl similarity index 100% rename from test/algorithms/paramspacesgd/scoregradelbo_locationscale_bijectors.jl rename to test/algorithms/scoregradelbo_locationscale_bijectors.jl diff --git a/test/algorithms/paramspacesgd/subsampledobj.jl b/test/algorithms/subsampledobj.jl similarity index 94% rename from test/algorithms/paramspacesgd/subsampledobj.jl rename to test/algorithms/subsampledobj.jl index f7e81d55f..c5b8720ec 100644 --- a/test/algorithms/paramspacesgd/subsampledobj.jl +++ b/test/algorithms/subsampledobj.jl @@ -44,7 +44,9 @@ end @testset "algorithm constructors" begin @testset for batchsize in [1, 3, 4] sub = ReshufflingBatchSubsampling(1:n_data, batchsize) - alg = KLMinRepGradDescent(AD; n_samples=10, subsampling=sub) + alg = KLMinRepGradDescent( + AD; n_samples=10, subsampling=sub, operator=ClipScale() + ) _, info, _ = optimize(alg, 10, prob, q0; show_progress=false) @test isfinite(last(info).elbo) @@ -63,8 +65,8 @@ end @testset "determinism" begin T = 128 sub = ReshufflingBatchSubsampling(1:n_data, 1) - sub_obj = SubsampledObjective(full_obj, sub) - alg = ParamSpaceSGD(sub_obj, AD, DoWG(), PolynomialAveraging(), ClipScale()) + alg = KLMinRepGradDescent(AD; subsampling=sub, 
operator=ClipScale()) + sub_obj = alg.objective rng = StableRNG(seed) q_avg, _, _ = optimize(rng, alg, T, prob, q0; show_progress=false) diff --git a/test/general/optimize.jl b/test/general/optimize.jl index 71c3e4fb4..126dc2e40 100644 --- a/test/general/optimize.jl +++ b/test/general/optimize.jl @@ -9,12 +9,7 @@ (; model, μ_true, L_true, n_dims, is_meanfield) = modelstats q0 = MeanFieldGaussian(zeros(Float64, n_dims), Diagonal(ones(Float64, n_dims))) - obj = RepGradELBO(10) - - optimizer = Optimisers.Adam(1e-2) - averager = PolynomialAveraging() - - alg = ParamSpaceSGD(obj, AD, optimizer, averager, IdentityOperator()) + alg = KLMinRepGradDescent(AD; optimizer=Optimisers.Adam(1e-2), operator=ClipScale()) @testset "default_rng" begin optimize(alg, T, model, q0; show_progress=false) diff --git a/test/runtests.jl b/test/runtests.jl index 2cf9474cb..2dae5b31c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -71,13 +71,13 @@ if GROUP == "All" || GROUP == "AD" include("general/ad.jl") include("general/mixedad_logdensity.jl") - include("algorithms/paramspacesgd/subsampledobj.jl") - include("algorithms/paramspacesgd/repgradelbo.jl") - include("algorithms/paramspacesgd/scoregradelbo.jl") - include("algorithms/paramspacesgd/repgradelbo_locationscale.jl") - include("algorithms/paramspacesgd/repgradelbo_locationscale_bijectors.jl") - include("algorithms/paramspacesgd/repgradelbo_proximal_locationscale.jl") - include("algorithms/paramspacesgd/repgradelbo_proximal_locationscale_bijectors.jl") - include("algorithms/paramspacesgd/scoregradelbo_locationscale.jl") - include("algorithms/paramspacesgd/scoregradelbo_locationscale_bijectors.jl") + include("algorithms/subsampledobj.jl") + include("algorithms/repgradelbo.jl") + include("algorithms/scoregradelbo.jl") + include("algorithms/repgradelbo_locationscale.jl") + include("algorithms/repgradelbo_locationscale_bijectors.jl") + include("algorithms/repgradelbo_proximal_locationscale.jl") + 
include("algorithms/repgradelbo_proximal_locationscale_bijectors.jl") + include("algorithms/scoregradelbo_locationscale.jl") + include("algorithms/scoregradelbo_locationscale_bijectors.jl") end