diff --git a/examples/classification.py b/examples/classification.py
index 360f5b2f..3ab3c08a 100644
--- a/examples/classification.py
+++ b/examples/classification.py
@@ -193,15 +193,20 @@
# $\boldsymbol{x}$, we can expand the log of this about the posterior mode
# $\hat{\boldsymbol{f}}$ via a Taylor expansion. This gives:
#
+# $$
# \begin{align}
# \log\tilde{p}(\boldsymbol{f}|\mathcal{D}) = \log\tilde{p}(\hat{\boldsymbol{f}}|\mathcal{D}) + \left[\nabla \log\tilde{p}({\boldsymbol{f}}|\mathcal{D})|_{\hat{\boldsymbol{f}}}\right]^{T} (\boldsymbol{f}-\hat{\boldsymbol{f}}) + \frac{1}{2} (\boldsymbol{f}-\hat{\boldsymbol{f}})^{T} \left[\nabla^2 \tilde{p}(\boldsymbol{y}|\boldsymbol{f})|_{\hat{\boldsymbol{f}}} \right] (\boldsymbol{f}-\hat{\boldsymbol{f}}) + \mathcal{O}(\lVert \boldsymbol{f} - \hat{\boldsymbol{f}} \rVert^3).
# \end{align}
+# $$
#
# Since $\nabla \log\tilde{p}({\boldsymbol{f}}|\mathcal{D})$ is zero at the mode,
# this suggests the following approximation
+#
+# $$
# \begin{align}
# \tilde{p}(\boldsymbol{f}|\mathcal{D}) \approx \log\tilde{p}(\hat{\boldsymbol{f}}|\mathcal{D}) \exp\left\{ \frac{1}{2} (\boldsymbol{f}-\hat{\boldsymbol{f}})^{T} \left[-\nabla^2 \tilde{p}(\boldsymbol{y}|\boldsymbol{f})|_{\hat{\boldsymbol{f}}} \right] (\boldsymbol{f}-\hat{\boldsymbol{f}}) \right\}
# \end{align},
+# $$
#
# that we identify as a Gaussian distribution,
# $p(\boldsymbol{f}| \mathcal{D}) \approx q(\boldsymbol{f}) := \mathcal{N}(\hat{\boldsymbol{f}}, [-\nabla^2 \tilde{p}(\boldsymbol{y}|\boldsymbol{f})|_{\hat{\boldsymbol{f}}} ]^{-1} )$.
diff --git a/examples/constructing_new_kernels.py b/examples/constructing_new_kernels.py
index ceb8b486..98aca6c9 100644
--- a/examples/constructing_new_kernels.py
+++ b/examples/constructing_new_kernels.py
@@ -198,9 +198,15 @@
# kernels do not exhibit this behaviour and instead _wrap_ around the boundary
# points to create a smooth function. Such a kernel was given in [Padonou &
# Roustant (2015)](https://hal.inria.fr/hal-01119942v1) where any two angles
-# $\theta$ and $\theta'$ are written as $$W_c(\theta, \theta') = \left\lvert
+# $\theta$ and $\theta'$ are written as
+#
+# $$
+# \begin{align}
+# W_c(\theta, \theta') & = \left\lvert
# \left(1 + \tau \frac{d(\theta, \theta')}{c} \right) \left(1 - \frac{d(\theta,
-# \theta')}{c} \right)^{\tau} \right\rvert \quad \tau \geq 4 \tag{1}.$$
+# \theta')}{c} \right)^{\tau} \right\rvert \quad \tau \geq 4 \tag{1}.
+# \end{align}
+# $$
#
# Here the hyperparameter $\tau$ is analogous to a lengthscale for Euclidean
# stationary kernels, controlling the correlation between pairs of observations.
diff --git a/examples/graph_kernels.py b/examples/graph_kernels.py
index 0bd9db41..1e43ee30 100644
--- a/examples/graph_kernels.py
+++ b/examples/graph_kernels.py
@@ -88,7 +88,9 @@
#
# Graph kernels use the _Laplacian matrix_ $L$ to quantify the smoothness of a signal
# (or function) on a graph
+#
# $$L=D-A,$$
+#
# where $D$ is the diagonal _degree matrix_ containing each vertices' degree and $A$
# is the _adjacency matrix_ that has an $(i,j)^{\text{th}}$ entry of 1 if $v_i, v_j$
# are connected and 0 otherwise. [Networkx](https://networkx.org) gives us an easy
diff --git a/examples/intro_to_gps.py b/examples/intro_to_gps.py
index 699bde90..bf8cf707 100644
--- a/examples/intro_to_gps.py
+++ b/examples/intro_to_gps.py
@@ -17,6 +17,7 @@
# %% [markdown]
# # New to Gaussian Processes?
#
+#
# Fantastic that you're here! This notebook is designed to be a gentle
# introduction to the mathematics of Gaussian processes (GPs). No prior
# knowledge of Bayesian inference or GPs is assumed, and this notebook is
@@ -33,10 +34,11 @@
# model are unknown, and our goal is to conduct inference to determine their
# range of likely values. To achieve this, we apply Bayes' theorem
#
+# $$
# \begin{align}
-# \label{eq:BayesTheorem}
-# p(\theta\,|\, \mathbf{y}) = \frac{p(\theta)p(\mathbf{y}\,|\,\theta)}{p(\mathbf{y})} = \frac{p(\theta)p(\mathbf{y}\,|\,\theta)}{\int_{\theta}p(\mathbf{y}, \theta)\mathrm{d}\theta}\,,
+# p(\theta\mid\mathbf{y}) = \frac{p(\theta)p(\mathbf{y}\mid\theta)}{p(\mathbf{y})} = \frac{p(\theta)p(\mathbf{y}\mid\theta)}{\int_{\theta}p(\mathbf{y}, \theta)\mathrm{d}\theta},
# \end{align}
+# $$
#
# where $p(\mathbf{y}\,|\,\theta)$ denotes the _likelihood_, or model, and
# quantifies how likely the observed dataset $\mathbf{y}$ is, given the
@@ -74,9 +76,13 @@
# new points $\mathbf{y}^{\star}$ through the _posterior predictive
# distribution_. This is achieved by integrating out the parameter set $\theta$
# from our posterior distribution through
+#
+# $$
# \begin{align}
# p(\mathbf{y}^{\star}\mid \mathbf{y}) = \int p(\mathbf{y}^{\star} \,|\, \theta, \mathbf{y} ) p(\theta\,|\, \mathbf{y})\mathrm{d}\theta\,.
# \end{align}
+# $$
+#
# As with the marginal log-likelihood, evaluating this quantity requires
# computing an integral which may not be tractable, particularly when $\theta$
# is high-dimensional.
@@ -85,13 +91,16 @@
# distribution, so we often compute and report moments of the posterior
# distribution. Most commonly, we report the first moment and the centred second
# moment
+#
# $$
# \begin{alignat}{2}
-# \mu = \mathbb{E}[\theta\,|\,\mathbf{y}] & = \int \theta p(\theta\mid\mathbf{y})\mathrm{d}\theta\\
+# \mu = \mathbb{E}[\theta\,|\,\mathbf{y}] & = \int \theta
+# p(\theta\mid\mathbf{y})\mathrm{d}\theta \quad \\
# \sigma^2 = \mathbb{V}[\theta\,|\,\mathbf{y}] & = \int \left(\theta -
# \mathbb{E}[\theta\,|\,\mathbf{y}]\right)^2p(\theta\,|\,\mathbf{y})\mathrm{d}\theta&\,.
# \end{alignat}
# $$
+#
# Through this pair of statistics, we can communicate our beliefs about the most
# likely value of $\theta$ i.e., $\mu$, and the uncertainty $\sigma$ around the
# expected value. However, as with the marginal log-likelihood and predictive
@@ -228,13 +237,16 @@
# %% [markdown]
# Extending the intuition given for the moments of a univariate Gaussian random
# variables, we can obtain the mean and covariance by
+#
# $$
# \begin{align}
-# \mathbb{E}[\mathbf{y}] = \mathbf{\mu}, \quad \operatorname{Cov}(\mathbf{y}) & = \mathbf{E}\left[(\mathbf{y} - \mathbf{\mu)}(\mathbf{y} - \mathbf{\mu)}^{\top} \right]\\
+# \mathbb{E}[\mathbf{y}] & = \mathbf{\mu}, \\
+# \operatorname{Cov}(\mathbf{y}) & = \mathbb{E}\left[(\mathbf{y} - \mathbf{\mu})(\mathbf{y} - \mathbf{\mu})^{\top} \right] \\
# & =\mathbb{E}[\mathbf{y}\mathbf{y}^{\top}] - \mathbb{E}[\mathbf{y}]\mathbb{E}[\mathbf{y}]^{\top} \\
# & =\mathbf{\Sigma}\,.
# \end{align}
# $$
+#
# The covariance matrix is a symmetric positive definite matrix that generalises
# the notion of variance to multiple dimensions. The matrix's diagonal entries
# contain the variance of each element, whilst the off-diagonal entries quantify
@@ -336,6 +348,7 @@
# $\mathbf{x}\sim\mathcal{N}(\boldsymbol{\mu}_{\mathbf{x}}, \boldsymbol{\Sigma}_{\mathbf{xx}})$ and
# $\mathbf{y}\sim\mathcal{N}(\boldsymbol{\mu}_{\mathbf{y}}, \boldsymbol{\Sigma}_{\mathbf{yy}})$.
# We define the joint distribution as
+#
# $$
# \begin{align}
# p\left(\begin{bmatrix}
@@ -348,6 +361,7 @@
# \end{bmatrix} \right)\,,
# \end{align}
# $$
+#
# where $\boldsymbol{\Sigma}_{\mathbf{x}\mathbf{y}}$ is the cross-covariance
# matrix of $\mathbf{x}$ and $\mathbf{y}$.
#
@@ -363,6 +377,7 @@
#
# For a joint Gaussian random variable, the marginalisation of $\mathbf{x}$ or
# $\mathbf{y}$ is given by
+#
# $$
# \begin{alignat}{3}
# & \int p(\mathbf{x}, \mathbf{y})\mathrm{d}\mathbf{y} && = p(\mathbf{x})
@@ -372,7 +387,9 @@
# \boldsymbol{\Sigma}_{\mathbf{yy}})\,.
# \end{alignat}
# $$
+#
# The conditional distributions are given by
+#
# $$
# \begin{align}
# p(\mathbf{y}\,|\, \mathbf{x}) & = \mathcal{N}\left(\boldsymbol{\mu}_{\mathbf{y}} + \boldsymbol{\Sigma}_{\mathbf{yx}}\boldsymbol{\Sigma}_{\mathbf{xx}}^{-1}(\mathbf{x}-\boldsymbol{\mu}_{\mathbf{x}}), \boldsymbol{\Sigma}_{\mathbf{yy}}-\boldsymbol{\Sigma}_{\mathbf{yx}}\boldsymbol{\Sigma}_{\mathbf{xx}}^{-1}\boldsymbol{\Sigma}_{\mathbf{xy}}\right)\,.
@@ -401,6 +418,7 @@
# We aim to capture the relationship between $\mathbf{X}$ and $\mathbf{y}$ using
# a model $f$ with which we may make predictions at an unseen set of test points
# $\mathbf{X}^{\star}\subset\mathcal{X}$. We formalise this by
+#
# $$
# \begin{align}
# y = f(\mathbf{X}) + \varepsilon\,,
@@ -430,6 +448,7 @@
# convenience in the remainder of this article.
#
# We define a joint GP prior over the latent function
+#
# $$
# \begin{align}
# p(\mathbf{f}, \mathbf{f}^{\star}) = \mathcal{N}\left(\mathbf{0}, \begin{bmatrix}
@@ -437,14 +456,17 @@
# \end{bmatrix}\right)\,,
# \end{align}
# $$
+#
# where $\mathbf{f}^{\star} = f(\mathbf{X}^{\star})$. Conditional on the GP's
# latent function $f$, we assume a factorising likelihood generates our
# observations
+#
# $$
# \begin{align}
# p(\mathbf{y}\,|\,\mathbf{f}) = \prod_{i=1}^n p(y_i\,|\, f_i)\,.
# \end{align}
# $$
+#
# Strictly speaking, the likelihood function is
# $p(\mathbf{y}\,|\,\phi(\mathbf{f}))$ where $\phi$ is the likelihood function's
# associated link function. Example link functions include the probit or
@@ -479,20 +501,25 @@
# $p(y_i\,|\, f_i) = \mathcal{N}(y_i\,|\, f_i, \sigma_n^2)$,
# marginalising $\mathbf{f}$ from the joint posterior to obtain
# the posterior predictive distribution is exact
+#
# $$
# \begin{align}
# p(\mathbf{f}^{\star}\mid \mathbf{y}) = \mathcal{N}(\mathbf{f}^{\star}\,|\,\boldsymbol{\mu}_{\,|\,\mathbf{y}}, \Sigma_{\,|\,\mathbf{y}})\,,
# \end{align}
# $$
+#
# where
+#
# $$
# \begin{align}
# \mathbf{\mu}_{\mid \mathbf{y}} & = \mathbf{K}_{\star f}\left( \mathbf{K}_{ff}+\sigma^2_n\mathbf{I}_n\right)^{-1}\mathbf{y} \\
# \Sigma_{\,|\,\mathbf{y}} & = \mathbf{K}_{\star\star} - \mathbf{K}_{xf}\left(\mathbf{K}_{ff} + \sigma_n^2\mathbf{I}_n\right)^{-1}\mathbf{K}_{fx} \,.
# \end{align}
# $$
+#
# Further, the log of the marginal likelihood of the GP can
# be analytically expressed as
+#
# $$
# \begin{align}
# & = 0.5\left(-\underbrace{\mathbf{y}^{\top}\left(\mathbf{K}_{ff} + \sigma_n^2\mathbf{I}_n \right)^{-1}\mathbf{y}}_{\text{Data fit}} -\underbrace{\log\lvert \mathbf{K}_{ff} + \sigma^2_n\rvert}_{\text{Complexity}} -\underbrace{n\log 2\pi}_{\text{Constant}} \right)\,.
@@ -505,6 +532,7 @@
# we call these terms the model hyperparameters
# $\boldsymbol{\xi} = \{\boldsymbol{\theta},\sigma_n^2\}$
# from which the maximum likelihood estimate is given by
+#
# $$
# \begin{align*}
# \boldsymbol{\xi}^{\star} = \operatorname{argmax}_{\boldsymbol{\xi} \in \Xi} \log p(\mathbf{y})\,.
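Editor's note: the conjugate-regression identities quoted in the `intro_to_gps.py` hunks above map almost line-for-line onto array code. The sketch below is an illustrative, standalone implementation of the posterior predictive moments and the log marginal likelihood using `jax.numpy` only; it is not GPJax's internal implementation, and the RBF kernel, noise level, and toy data are assumptions made for the example.

```python
# Standalone sketch of the GP regression identities above (assumed RBF kernel and
# observation-noise level; not GPJax's internal implementation).
import jax.numpy as jnp
from jax import config

config.update("jax_enable_x64", True)


def rbf(x1, x2, lengthscale=0.2, variance=1.0):
    # k(x, x') = sigma^2 exp(-||x - x'||^2 / (2 l^2))
    sq_dists = jnp.sum((x1[:, None, :] - x2[None, :, :]) ** 2, axis=-1)
    return variance * jnp.exp(-0.5 * sq_dists / lengthscale**2)


def posterior_predictive(X, y, X_star, obs_noise=0.1):
    n = X.shape[0]
    Kff = rbf(X, X) + obs_noise**2 * jnp.eye(n)  # K_ff + sigma_n^2 I_n
    Ksf = rbf(X_star, X)                         # K_{*f}
    Kss = rbf(X_star, X_star)                    # K_{**}
    L = jnp.linalg.cholesky(Kff)
    alpha = jnp.linalg.solve(L.T, jnp.linalg.solve(L, y))
    mu = Ksf @ alpha                             # K_{*f} (K_ff + sigma_n^2 I)^{-1} y
    V = jnp.linalg.solve(L, Ksf.T)
    cov = Kss - V.T @ V                          # K_{**} - K_{*f} (...)^{-1} K_{f*}
    return mu, cov


def log_marginal_likelihood(X, y, obs_noise=0.1):
    n = X.shape[0]
    Kff = rbf(X, X) + obs_noise**2 * jnp.eye(n)
    L = jnp.linalg.cholesky(Kff)
    alpha = jnp.linalg.solve(L.T, jnp.linalg.solve(L, y))
    # -0.5 * (data fit + complexity + constant), matching the decomposition above
    return -0.5 * (y @ alpha + n * jnp.log(2.0 * jnp.pi)) - jnp.sum(jnp.log(jnp.diag(L)))


X = jnp.linspace(0.0, 1.0, 20)[:, None]
y = jnp.sin(2.0 * jnp.pi * X[:, 0])
X_star = jnp.linspace(-0.2, 1.2, 100)[:, None]

mu, cov = posterior_predictive(X, y, X_star)
print(mu.shape, cov.shape, log_marginal_likelihood(X, y))
```

Maximising `log_marginal_likelihood` with respect to the kernel hyperparameters and the observation noise is precisely the $\operatorname{argmax}$ problem stated at the end of the hunk above; GPJax performs this optimisation with gradient-based optimisers rather than by hand.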
diff --git a/examples/intro_to_kernels.py b/examples/intro_to_kernels.py
index 7fabec09..da03540b 100644
--- a/examples/intro_to_kernels.py
+++ b/examples/intro_to_kernels.py
@@ -8,7 +8,7 @@
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
-#       jupytext_version: 1.16.4
+#       jupytext_version: 1.11.2
#   kernelspec:
#     display_name: gpjax
#     language: python
@@ -37,7 +37,7 @@
from sklearn.preprocessing import StandardScaler

from examples.utils import use_mpl_style
-from gpjax.parameters import PositiveReal
+from gpjax.parameters import PositiveReal, Static
from gpjax.typing import Array

config.update("jax_enable_x64", True)
@@ -54,11 +54,26 @@
cols = mpl.rcParams["axes.prop_cycle"].by_key()["color"]

# %% [markdown]
-# Using Gaussian Processes (GPs) to model functions can offer several advantages over alternative methods, such as deep neural networks. One key advantage is their rich quantification of uncertainty; not only do they provide *point estimates* for the values taken by a function throughout its domain, but they provide a full predictive posterior *distribution* over the range of values the function may take. This rich quantification of uncertainty is useful in many applications, such as Bayesian optimisation, which relies on being able to make *uncertainty-aware* decisions.
+# Using Gaussian Processes (GPs) to model functions can offer several advantages over
+# alternative methods, such as deep neural networks. One key advantage is their rich
+# quantification of uncertainty; not only do they provide *point estimates* for the
+# values taken by a function throughout its domain, but they provide a full predictive
+# posterior *distribution* over the range of values the function may take. This rich
+# quantification of uncertainty is useful in many applications, such as Bayesian
+# optimisation, which relies on being able to make *uncertainty-aware* decisions.
#
-# However, another advantage of GPs is the ability for one to place *priors* on the functions being modelled. For instance, one may know that the underlying function being modelled observes certain characteristics, such as being *periodic* or having a certain level of *smoothness*. The *kernel*, or *covariance function*, is the primary means through which one is able to encode such prior knowledge about the function being modelled. This enables one to equip the GP with inductive biases which enable it to learn from data more efficiently, whilst generalising to unseen data more effectively.
+# However, another advantage of GPs is the ability for one to place *priors* on the
+# functions being modelled. For instance, one may know that the underlying function
+# being modelled observes certain characteristics, such as being *periodic* or having a
+# certain level of *smoothness*. The *kernel*, or *covariance function*, is the primary
+# means through which one is able to encode such prior knowledge about the function
+# being modelled. This enables one to equip the GP with inductive biases which enable it
+# to learn from data more efficiently, whilst generalising to unseen data more
+# effectively.
#
-# In this notebook we'll develop some intuition for what kinds of priors are encoded through the use of different kernels, and how this can be useful when modelling different types of functions.
+# In this notebook we'll develop some intuition for what kinds of priors are encoded
+# through the use of different kernels, and how this can be useful when modelling
+# different types of functions.

# %% [markdown]
# ## What is a Kernel?
@@ -66,20 +81,24 @@
# Intuitively, for a function $f$, the kernel defines the notion of *similarity* between
# the value of the function at two points, $f(\mathbf{x})$ and $f(\mathbf{x}')$, and
# will be denoted as $k(\mathbf{x}, \mathbf{x}')$:
+#
+# $$
# \begin{aligned}
# k(\mathbf{x}, \mathbf{x}') &= \text{Cov}[f(\mathbf{x}), f(\mathbf{x}')] \\
# &= \mathbb{E}[(f(\mathbf{x}) - \mathbb{E}[f(\mathbf{x})])(f(\mathbf{x}') - \mathbb{E}[f(\mathbf{x}')])]
# \end{aligned}
-# One would expect that, given a previously unobserved test point $\mathbf{x}^*$, the
-# training points which are *closest* to this unobserved point will be most similar to
-# it. As such, the kernel is used to define this notion of similarity within the GP
-# framework. It is up to the user to select a kernel function which is appropriate for
-# the function being modelled. In this notebook we are going to give some examples of
-# commonly used kernels, and try to develop an understanding of when one may wish to use
-# one kernel over another. However, before we do this, it is worth discussing the
-# necessary conditions for a function to be a valid kernel/covariance function. This
-# requires a little bit of maths, so for those of you who just wish to obtain an
-# intuitive understanding, feel free to skip to the section introducing the Matérn
+# $$
+#
+# One would expect that, given a previously unobserved test point $\mathbf{x}^*$, the
+# training points which are *closest* to this unobserved point will be most similar to
+# it. As such, the kernel is used to define this notion of similarity within the GP
+# framework. It is up to the user to select a kernel function which is appropriate for
+# the function being modelled. In this notebook we are going to give some examples of
+# commonly used kernels, and try to develop an understanding of when one may wish to use
+# one kernel over another. However, before we do this, it is worth discussing the
+# necessary conditions for a function to be a valid kernel/covariance function. This
+# requires a little bit of maths, so for those of you who just wish to obtain an
+# intuitive understanding, feel free to skip to the section introducing the Matérn
# family of kernels.
#
# ### What are the necessary conditions for a function to be a valid kernel?
@@ -237,7 +256,6 @@ def forrester(x: Float[Array, "N"]) -> Float[Array, "N"]:  # noqa: F821
# First we define our model, using the Matérn52 kernel, and construct our posterior *without* optimising the kernel hyperparameters:

# %%
-
mean = gpx.mean_functions.Zero()
kernel = gpx.kernels.Matern52(
    lengthscale=jnp.array(0.1)
@@ -246,7 +264,7 @@ def forrester(x: Float[Array, "N"]) -> Float[Array, "N"]:  # noqa: F821
prior = gpx.gps.Prior(mean_function=mean, kernel=kernel)

likelihood = gpx.likelihoods.Gaussian(
-    num_datapoints=D.n, obs_stdev=Static(jnp.array(1e-3))
+    num_datapoints=D.n, obs_stddev=Static(jnp.array(1e-3))
)  # Our function is noise-free, so we set the observation noise's standard deviation to a very small value
no_opt_posterior = prior * likelihood

diff --git a/examples/likelihoods_guide.py b/examples/likelihoods_guide.py
index 4e210be5..5a558ce6 100644
--- a/examples/likelihoods_guide.py
+++ b/examples/likelihoods_guide.py
@@ -227,22 +227,24 @@
# expected log-likelihood. This term is evaluated in the
# [stochastic variational Gaussian process](uncollapsed_vi.md) in the ELBO term. For a
# variational approximation $q(f)= \mathcal{N}(f\mid m, S)$, the ELBO can be written as
+#
# $$
# \begin{align}
-# \label{eq:elbo}
# \mathcal{L}(q) = \mathbb{E}_{f\sim q(f)}\left[ p(\mathbf{y}\mid f)\right] - \mathrm{KL}\left(q(f)\mid\mid p(f)\right)\,.
# \end{align}
# $$
+#
# As both $q(f)$ and $p(f)$ are Gaussian distributions, the Kullback-Leibler term can
# be analytically computed. However, the expectation term is not always so easy to
-# compute. Fortunately, the bound in \eqref{eq:elbo} can be decomposed as a sum of the
+# compute. Fortunately, the bound above can be decomposed as a sum over the
# datapoints
+#
# $$
# \begin{align}
-# \label{eq:elbo_decomp}
# \mathcal{L}(q) = \sum_{n=1}^N \mathbb{E}_{f\sim q(f)}\left[ p(y_n\mid f)\right] - \mathrm{KL}\left(q(f)\mid\mid p(f)\right)\,.
# \end{align}
# $$
+#
# This simplifies computation of the expectation as it is now a series of $N$
# 1-dimensional integrals. As such, GPJax by default uses quadrature to compute these
# integrals. However, for some likelihoods, such as the Gaussian likelihood, the
diff --git a/examples/oceanmodelling.py b/examples/oceanmodelling.py
index e1d8041b..bb1820ea 100644
--- a/examples/oceanmodelling.py
+++ b/examples/oceanmodelling.py
@@ -170,7 +170,9 @@ def prepare_data(df):
# $$
# X_{D,i} = \left( \mathbf{x}_{D,i}, z \right),
# $$
+#
# and
+#
# $$
# Y_{D,i} = y_{D,i}^{(z)},
# $$
@@ -311,7 +313,7 @@ def latent_distribution(opt_posterior, pos_3d, dataset_train):

# %% [markdown]
-# We now replot the ground truth (testing data) $D_0$, the predicted latent vector field $\mathbf{F}_{\text{latent}}(\mathbf{x_i})$, and a heatmap of the residuals at each location $\mathbf{R}(\mathbf{x}_{0,i}) = \mathbf{y}_{0,i} - \mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i}) $, as well as $\left|\left|\mathbf{R}(\mathbf{x}_{0,i})\right|\right|$.
+# We now replot the ground truth (testing data) $D_0$, the predicted latent vector field $\mathbf{F}_{\text{latent}}(\mathbf{x_i})$, and a heatmap of the residuals at each location $\mathbf{R}(\mathbf{x}_{0,i}) = \mathbf{y}_{0,i} - \mathbf{F}_{\text{latent}}(\mathbf{x}_{0,i})$, as well as $\left|\left|\mathbf{R}(\mathbf{x}_{0,i})\right|\right|$.

# %%
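Editor's note, referring back to the `likelihoods_guide.py` hunk above: the decomposition of the ELBO's expectation term into $N$ one-dimensional integrals is what makes quadrature practical. The sketch below illustrates that idea with Gauss-Hermite nodes only; it is not GPJax's quadrature code, and the Poisson likelihood, node count, and variational marginals are assumptions chosen for the example.

```python
# Gauss-Hermite sketch of E_{q(f_n)}[log p(y_n | f_n)] for each datapoint, where
# q(f_n) = N(m_n, s_n^2). Illustrative only; not GPJax's implementation.
import jax.numpy as jnp
import numpy as np
from jax.scipy.stats import poisson


def expected_log_lik(y, means, variances, log_lik, num_points=20):
    # Nodes/weights for int exp(-t^2) g(t) dt; substitute f = m + sqrt(2) * s * t.
    t, w = np.polynomial.hermite.hermgauss(num_points)
    t, w = jnp.asarray(t), jnp.asarray(w)
    f = means[:, None] + jnp.sqrt(2.0 * variances)[:, None] * t[None, :]
    return (log_lik(y[:, None], f) * w[None, :]).sum(axis=-1) / jnp.sqrt(jnp.pi)


def poisson_log_lik(y, f):
    # log p(y | f) under a Poisson likelihood with log link, i.e. rate = exp(f)
    return poisson.logpmf(y, jnp.exp(f))


y = jnp.array([0.0, 2.0, 1.0])   # observed counts
m = jnp.array([-0.3, 0.7, 0.1])  # marginal means of q(f_n)
s2 = jnp.array([0.2, 0.5, 0.1])  # marginal variances of q(f_n)

# One value per datapoint; summing them gives the ELBO's expectation term.
print(expected_log_lik(y, m, s2, poisson_log_lik))
```

For the Gaussian likelihood mentioned in the same hunk, this expectation is available in closed form, so no quadrature is required in that case.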