File: kernels.tex

package info (click to toggle)
pymvpa 0.4.5~dev23-2
links: PTS, VCS
area: main
in suites: squeeze
size: 6,884 kB
ctags: 3,480
sloc: python: 25,450; cpp: 2,663; ansic: 445; makefile: 426; sh: 194
file content (355 lines) | stat: -rw-r--r-- 16,174 bytes
parent folder | download | duplicates (2)
\documentclass[a4paper,11pt]{article}
\usepackage[latin1]{inputenc}
\usepackage[english]{babel}
\usepackage{epsfig}
\usepackage{amsmath}
\usepackage{amsfonts}

\newcommand\R{{\mathbb R}}
\newcommand\x{{\mathbf x}}
\newcommand\X{{\mathbf X}}
\newcommand\K{{\mathbf K}}
\newcommand\J{{\mathbf J}}
\newcommand\LL{{\mathbf L}}
\newcommand\ELL{{\Ivec \ell}}
%%\newcommand\L{{\mathbf L}}
%%\DeclareMathOperator*{\argmax}{arg\,max}
%% \DeclareMathOperator*{\argmax}{argmax} %% (argmax wihtouth mid space)
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\var}{var}
\DeclareMathOperator*{\dm}{dm}
\newcommand{\Rvec}[1]{{\bf #1}}
\newcommand{\Ivec}[1]{\mbox{\boldmath $#1$}}

\title{Kernels}
\author{Emanuele Olivetti}

\begin{document}

\maketitle

\section{Introduction}
This document gives a detailed description of the kernels implemented in
PyMVPA together with the derivation of their gradients. Gradients are
useful when trying to maximize the marginal likelihood of a Gaussian
process, i.e., during model selection.

The following notation and definitions are used:
\begin{itemize}
\item $\x \in \R^D$ : a $D$-dimensional column vector, $\x =
  (x_1,\ldots,x_D)^{\top}$.
\item $\X = (\x_1,\ldots,\x_N)^{\top}$ : an $N \times D$ matrix
  where each row is a $D$-dimensional vector. $\X$ is also called the set
  of \emph{samples}. $\X_{* i}$ indicates the $i$-th column of $\X$
  and is a column vector. $\X_{j *}$ indicates the $j$-th row of $\X$
  and is a row vector.
\item $k: \R^D \times \R^D \rightarrow \R$ : a covariance (or kernel)
  function.
\item $\K(\X,\X')$ : the matrix extension of $k$, i.e., $\K_{pq} =
  k({\X_{p *}}^{\top},{\X'_{q *}}^{\top})$. If $\X$ is an $N \times D$ matrix and $\X'$ is
  $N' \times D$ then $\K(\X,\X')$ is an $N \times N'$ matrix.
\item $\J_{n,m}$ : the $n \times m$ matrix of ones, i.e., a matrix
  where each element is 1.
\item $\|\mathbf{z}\|_p$ : the $p$-norm of vector $\mathbf{z}$ defined
  as $\|\mathbf{z}\|_p = (\sum_{i=1}^D |z_i|^p)^{\frac{1}{p}}$.
  The Euclidean norm is the case $p=2$, where $\|\mathbf{z}\|_2 =
  \sqrt{\mathbf{z}^{\top}\mathbf{z}} = \sqrt{\sum_{i=1}^D z_i^2}$.
\item $\|\mathbf{z},\mathbf{w}\|_p$ : the \emph{weighted} $p$-norm of
  vector $\mathbf{z}$ defined as $\|\mathbf{z},\mathbf{w}\|_p =
  (\sum_{i=1}^D w_i|z_i|^p)^{\frac{1}{p}}$.  The weighted Euclidean norm is
  the case $p=2$, where $\|\mathbf{z},\mathbf{w}\|_2 = \sqrt{\mathbf{z}^{\top}
    \mathbf{W} \mathbf{z}} = \sqrt{\sum_{i=1}^D w_i z_i^2}$,
  where $\mathbf{W} = diag(\mathbf{w})$.
\item $\dm(\X,\X')$ : the \emph{Euclidean distance matrix} between
  $\X$ and $\X'$ defined element by element as $\dm(\X,\X')_{pq} =
  \|\X_{p *} - \X'_{q *}\|_2 = \sqrt{\sum_{i=1}^D (\X_{p i} - \X'_{q
      i})^2}$. If $\X$ is an $N \times D$ matrix and $\X'$ is $N'
  \times D$ then $\dm(\X,\X')$ is an $N \times N'$ matrix. Note that
  $\dm(\X,\X')$ is the entrywise square root of what is usually called
  ``distance matrix''.
\item $\dm(\X,\X',\mathbf{w})$ : the \emph{weighted} Euclidean
  distance matrix between $\X$ and $\X'$ defined element by element as
  $\dm(\X,\X',\mathbf{w})_{pq} = \|(\X_{p *} - \X'_{q *})^{\top},
  \mathbf{w}\|_2 = \sqrt{\sum_{i=1}^D w_i(\X_{p i} - \X'_{q i})^2}$
  through the weight vector $\mathbf{w} \in \R^D$. It is
  straightforward that $\dm(\X,\X') = \dm(\X,\X',\mathbf{J}_{D,1})$.
\item $\X \bullet \mathbf{Y}$ : the Hadamard (or Schur) matrix
  product, i.e. the entrywise product between matrices of the same
  size. Let $\mathbf{Z} = \X \bullet \mathbf{Y}$, then $z_{ij} =
  x_{ij} y_{ij}$.
\item $\X^{\alpha}$ : the entrywise power, i.e., $(\X^{\alpha})_{ij} = (\X_{ij})^{\alpha}$.
\end{itemize}

\section{Constant kernel}
$$k(\x,\x') = \sigma_0^2$$
where $\sigma_0 \ge 0$ is the standard deviation of the Gaussian prior
probability $\mathcal{N}(0,\sigma_0^2)$ of the value of the constant.
$$\K(\X,\X') = \sigma_0^2 \J_{N,N'}$$
$$\mathbf{\Theta} = \{\sigma_0\}$$
$$\frac{\partial k}{\partial \sigma_0}(\x,\x') = 2\sigma_0$$
$$\frac{\partial \K}{\partial \sigma_0} = 2\sigma_0 \J_{N,N'}$$
$$A = \sigma_0^2$$
$$A \ge 0$$
$$\mathbf{\Theta}^* = \{A\}$$
$$k(\x,\x') = A$$
$$\K(\X,\X') = A \J_{N,N'}$$
$$\frac{\partial k}{\partial A} = 1$$
$$\nabla_A \K = \frac{\partial \K}{\partial A} = \J_{N,N'}$$
Note that using $A$ as hyperparameter the gradient becomes constant.

\section{Linear kernel}
Let $\Ivec{\Sigma}_p$ be the $D \times D$ covariance matrix of the Gaussian
prior probability $\mathcal{N}(\Ivec{0},\Ivec{\Sigma}_p)$ of the weights of
the Bayesian linear regression.
$$k(\x,\x') = \x^{\top} \Ivec{\Sigma}_p \x'$$
$$\K(\X,\X') = \X \Ivec{\Sigma}_p \X'^{\top}$$
In order to simplify formulas we assume $\Ivec{\Sigma}_p$ is diagonal, i.e.,
$\Ivec{\Sigma}_p = diag(\Ivec{\sigma}^2_p)$ where $\Ivec{\sigma}^2_p =
({\sigma^2_p}_1,\ldots,{\sigma^2_p}_D)$:
$$k(\x,\x') = \sum_{i=1}^D {\sigma^2_p}_i x_i x'_i$$
$$\mathbf{\Theta} = \{{\sigma_p}_1,\ldots,{\sigma_p}_D\}$$
$$\frac{\partial k}{\partial {\sigma_p}_i} = 2 {\sigma_p}_i x_i x'_i$$
$$A_i = {\sigma_p^2}_i$$
$$A_i \ge 0$$
$$\mathbf{A} = (A_1,\ldots,A_D)^{\top}$$
$$\mathbf{\Theta}^* = \{ \mathbf{A} \}$$
$$k(\x,\x') = \x^{\top} diag(\mathbf{A}) \x'$$
$$\K(\X,\X') = \X diag(\mathbf{A}) \X'^{\top}$$
$$\frac{\partial k}{\partial A_i} = x_i x'_i$$
$$\frac{\partial \K}{\partial A_i} = \X_{* i} {\X'_{* i}}^{\top}$$
$$\nabla_{\mathbf{A}} \K = ( \X_{* 1} {\X'_{* 1}}^{\top}, \ldots,
\X_{* D} {\X'_{* D}}^{\top})$$
As expected the gradient is independent of the hyperparameters values
and can be computed once for all at the beginning.

\section{Polynomial kernel}
$$k(\x,\x') = (\sigma_0^2 + \x^{\top} \Ivec{\Sigma}_p \x')^p =
(\sigma_0^2 + \sum_{i=1}^D {\sigma^2_p}_i x_i x'_i)^p$$
$$\K(\X,\X') = (\sigma_0^2 \mathbf{J}_{N,N'} + \X \Ivec{\Sigma}_p
\X'^{\top})^p$$
In order to simplify formulas we assume $\Ivec{\Sigma}_p$ is diagonal, i.e.,
$\Ivec{\Sigma}_p = diag(\Ivec{\sigma}^2_p)$ where $\Ivec{\sigma}^2_p =
({\sigma^2_p}_1,\ldots,{\sigma^2_p}_D)$.
$$\sigma_0 \ge 0$$
$$\Ivec{\sigma}_p = ({\sigma_p}_1,\ldots,{\sigma_p}_D)$$
$${\sigma_p}_i \ge 0$$
$$\mathbf{\Theta} = \{\sigma_0,\Ivec{\sigma}_p, p\}$$
$$A = \sigma_0^2$$
$$B_i = {\sigma^2_p}_i$$
$$\mathbf{B} = (B_1,\ldots,B_D)$$
$$\mathbf{\Theta}^* = \{A,\mathbf{B}, p\}$$
$$k(\x,\x') = (A + \x^{\top} diag(\mathbf{B}) \x')^p = (A +
\sum_{i=1}^D B_i x_i x'_i)^p$$
$$\frac{\partial k}{\partial A} = p(A + \sum_{i=1}^D B_i x_i
x'_i)^{p-1}$$
$$\frac{\partial \K}{\partial A} = p(A\mathbf{J}_{N,N'} + \sum_{i=1}^D
B_i \X_{*i} {\X'_{*i}}^{\top})^{p-1}$$
$$\frac{\partial k}{\partial B_i} = p(A + \sum_{i=1}^D B_i x_i
x'_i)^{p-1} x_i x'_i = \frac{\partial k}{\partial A} x_i x'_i$$
$$\frac{\partial \K}{\partial B_i} = \frac{\partial \K}{\partial A} \bullet \left( \X_{*i} {\X'_{*i}}^{\top} \right)$$
$$\frac{\partial k}{\partial p} = k(\x,\x') \ln(A + \sum_{i=1}^D B_i x_i
x'_i)$$
$$\frac{\partial \K}{\partial p} = \K(\X,\X') \bullet \ln(A \mathbf{J}_{N,N'}+ \sum_{i=1}^D B_i \X_{*i} {\X'_{*i}}^{\top})$$
$$\mathbf{M} = A \mathbf{J}_{N,N'}+ \sum_{i=1}^D B_i \X_{*i} {\X'_{*i}}^{\top}$$
$$\nabla_{A,\mathbf{B},p} \K = \left(p\mathbf{M}^{p-1},\left\{p\mathbf{M}^{p-1} \bullet \left( \X_{*i} {\X'_{*i}}^{\top} \right) \right\}_{i=1,\ldots,D}, \K(\X,\X') \bullet \ln(\mathbf{M}) \right)$$

\section{Exponential kernel}
\subsection{Scalar Lengthscale $\ell$}
$$k(\x,\x') = \sigma_f^2 e^{-\frac{\|\x-\x'\|_2}{\ell}}$$
$$\ell > 0$$
$$\sigma_f \ge 0$$
$$\mathbf{\Theta} = \{ \sigma_f, \ell \}$$
$$\K(\X,\X') = \sigma_f^2 e^{-\frac{1}{\ell}\dm(\X,\X')}$$
$$A = \sigma_f^2$$
$$A \ge 0$$
$$B = -\frac{1}{\ell}$$
$$B < 0$$
$$\mathbf{\Theta}^* = \{ A, B \}$$
$$k(\x,\x') = A e^{B\|\x-\x'\|_2}$$
$$\K(\X,\X') = A e^{B \dm(\X,\X')}$$
$$\frac{\partial k}{\partial A} = e^{B\|\x-\x'\|_2} = \frac{1}{A}k(\x,\x')$$
$$\frac{\partial \K}{\partial A} = e^{B \dm(\X,\X')} = \frac{1}{A} \K(\X,\X')$$
$$\frac{\partial k}{\partial B} = A e^{B\|\x-\x'\|_2} \|\x-\x'\|_2 =
k(\x,\x') \|\x-\x'\|_2$$
$$\frac{\partial \K}{\partial B} = \K(\X,\X') \bullet \dm(\X,\X')$$
$$\nabla_{A,B} \K = (\frac{1}{A} \K(\X,\X'), \K(\X,\X') \bullet \dm(\X,\X'))$$

Note that if $\K(\X,\X')$ is precomputed, then the gradient consists
in just a scaling of $\K(\X,\X')$ by the constant $\frac{1}{A}$ and one
element-by-element product of $\K(\X,\X')$ against the distance matrix
$\dm(\X,\X')$, which is constant and independent of the hyperparameters.

\subsection{Vector of Lengthscales $\Ivec{\ell}$}
Given $\Ivec{\ell} = (\ell_1,\ldots,\ell_D)$, $\ell_i > 0$ and
$\ELL^{-2} = (1/\ell_1^2,\ldots,1/\ell_D^2)$
$$k(\x,\x') = \sigma_f^2 e^{-\|\x-\x',\ELL^{-2}\|_2}= \sigma_f^2
  e^{-\sqrt{\sum_{i=1}^D \left(\frac{x_i - x'_i}{\ell_i}\right)^2}}$$
$$\K(\X,\X') = \sigma_f^2 e^{-\dm(\X,\X',\ELL^{-2})}$$
$$\mathbf{\Theta} = \{ \sigma_f, \ELL\}$$
$$\sigma_f \ge 0$$
$$\ell_i > 0$$
$$A = \sigma_f^2$$
$$\mathbf{B} = \ELL^{-2}$$
$$A \ge 0$$
$$B_i > 0$$
$$\mathbf{\Theta}^* = \{ A, \mathbf{B}\}$$
$$k(\x,\x') = A e^{-\|\x-\x',\mathbf{B}\|_2} = A e^{-\sqrt{\sum_{i=1}^D
    B_i(x_i - x'_i)^2}}$$  
$$\K(\X,\X') = A e^{-\dm(\X,\X',\mathbf{B})}$$
$$\frac{\partial k}{\partial A} = e^{-\sqrt{\sum_{i=1}^D B_i(x_i -
    x'_i)^2}} = \frac{k(\x,\x')}{A}$$
$$\frac{\partial \K}{\partial A} = e^{-\dm(\X,\X',\mathbf{B})} =
\frac{1}{A}\K(\X,\X')$$ 
$$\frac{\partial k}{\partial B_i} = A e^{-\|\x-\x',\mathbf{B}\|_2}
\left( -\frac{1}{2} \|\x-\x',\mathbf{B}\|_2^{-1} \right) (x_i - x'_i)^2 =
-\frac{1}{2} k(\x,\x') \|\x-\x',\mathbf{B}\|_2^{-1} (x_i - x'_i)^2 $$
$$\frac{\partial \K}{\partial B_i} = -\frac{1}{2} \K(\X,\X')
\bullet \dm(\X,\X',\mathbf{B})^{-1} \bullet (\X_{*i}\J_{1,N'}-
(\X'_{*i}\J_{1,N})^{\top})^2$$
$$\nabla_{A,\mathbf{B}} \K = \left(\frac{1}{A}\K(\X,\X'), \left\{-\frac{1}{2}
    \K(\X,\X') \bullet \dm(\X,\X',\mathbf{B})^{-1} \bullet
    (\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2 \right\}_{i=1 \ldots
    D} \right)$$

The factor $\|\x-\x',\mathbf{B}\|_2^{-1}$ comes from differentiating the
square root in the exponent, so $\frac{\partial k}{\partial B_i}$ is
defined only for $\x \neq \x'$, i.e., where the weighted distance is
nonzero.

Note that $\frac{\partial \K}{\partial A}$ requires just the
multiplication of a constant by the kernel matrix $\K(\X,\X')$ whose
values are usually already available. Instead $\frac{\partial \K}{\partial
  B_i}$ is an entrywise product of 3 matrices: $\K(\X,\X')$ (usually
already available), $\dm(\X,\X',\mathbf{B})^{-1}$ (which is, apart from the
entrywise inverse, part of the computation of $\K(\X,\X')$ so it can be stored in
advance), and $(\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2$ which
does not depend upon $A$ and $\mathbf{B}$ so it can be computed once
for all. Note that in NumPy $(\X_{*i}\J_{1,N'}-
(\X'_{*i}\J_{1,N})^{\top})^2$ can be computed as {\ttfamily
  numpy.subtract.outer($X_{*i},X'_{*i}$)**2}.

\section{Squared Exponential kernel}

\subsection{Scalar Lengthscale $\ell$}
$$k(\x,\x') = \sigma_f^2 e^{-\frac{1}{2\ell^2} (\x-\x')^{\top}(\x-\x')}
  = \sigma_f^2 e^{-\frac{1}{2\ell^2} \sum_{i=1}^D (x_i - x'_i)^2} =
  \sigma_f^2 e^{-\frac{1}{2\ell^2} \|\x-\x'\|_2^2}$$ 
$$\K(\X,\X') = \sigma_f^2 e^{-\frac{1}{2\ell^2} \dm(\X,\X')^2}$$
$$\sigma_f \ge 0$$
$$\ell > 0$$
$$\mathbf{\Theta} = \{ \sigma_f, \ell \}$$
$$\frac{\partial k}{\partial \sigma_f} = \frac{2}{\sigma_f}k$$
$$\frac{\partial \K}{\partial \sigma_f} = \frac{2}{\sigma_f}\K$$
$$\frac{\partial k}{\partial \ell} = \ell^{-3} k \|\x-\x'\|_2^2 $$
$$\frac{\partial \K}{\partial \ell} = \ell^{-3} \K \bullet \dm(\X,\X')^2$$
$$\nabla_{\sigma_f,\ell} \K = \left(\frac{2}{\sigma_f}\K(\X,\X'), \ell^{-3} \K \bullet \dm(\X,\X')^2 \right)$$
Logscale:
$$A = \ln{\sigma_f}$$
$$\sigma_f = e^A$$
$$\frac{\partial k}{\partial A} = 2k$$
$$\frac{\partial \K}{\partial A} = 2\K$$
$$B = \ln{\ell}$$
$$\ell = e^B$$
$$\frac{\partial k}{\partial B} = \ell^{-2} k \|\x-\x'\|_2^2$$
$$\frac{\partial \K}{\partial B} = \ell^{-2} \K \bullet \dm(\X,\X')^2 = \K \bullet \dm(\X,\X',\ell^{-2}\J_{D,1})^2$$
$$\nabla_{A,B} \K = \left( 2\K(\X,\X'), \ell^{-2} \K \bullet \dm(\X,\X')^2 \right)$$
Another mapping:
$$A = \sigma_f^2$$
$$B = -\frac{1}{2\ell^2}$$
$$A \ge 0$$
$$B < 0$$
$$\mathbf{\Theta}^* = \{ A, B\}$$
$$k(\x,\x') = A e^{B (\x-\x')^{\top}(\x-\x')}$$
$$\K(\X,\X') = A e^{B \dm(\X,\X')^2}$$
$$\frac{\partial k}{\partial A} = \frac{k(\x,\x')}{A}$$
$$\frac{\partial \K}{\partial A} = \frac{1}{A}\K(\X,\X')$$
$$\frac{\partial k}{\partial B} = k(\x,\x') \|\x-\x'\|_2^2$$
$$\frac{\partial \K}{\partial B} = \K(\X,\X') \bullet \dm(\X,\X')^2$$
$$\nabla_{A,B} \K = (\frac{1}{A} \K(\X,\X'), \dm(\X,\X')^2 \bullet
\K(\X,\X'))$$

Note that $\nabla_{A,B} \K$ is similar to that of the exponential
kernel, so almost all the comments made before apply here as well.


\subsection{Vector of Lengthscales $\ELL$}
Let $\mathbf{L} = diag(\ELL)$:
$$k(\x,\x') = \sigma_f^2 e^{-\frac{1}{2}(\x-\x')^{\top} \LL^{-2}
  (\x-\x')} = \sigma_f^2 e^{-\frac{1}{2}\sum_{i=1}^D \frac{(x_i -
    x'_i)^2}{\ell_i^2}} = \sigma_f^2
e^{-\frac{1}{2}\|\x-\x',\ELL^{-2}\|_2^2}$$
$$\K(\X,\X') = \sigma_f^2 e^{-\frac{1}{2} \dm(\X,\X',\ELL^{-2})^2}$$
$$\sigma_f \ge 0$$
$$\ell_i > 0$$
$$\mathbf{\Theta} = \{ \sigma_f, \ELL\}$$
$$\frac{\partial k}{\partial \sigma_f} = \frac{2}{\sigma_f}k$$
$$\frac{\partial \K}{\partial \sigma_f} = \frac{2}{\sigma_f}\K$$
$$\frac{\partial k}{\partial \ell_i} = \ell_i^{-3} k (x_i - x'_i)^2 $$
$$\frac{\partial \K}{\partial \ell_i} = \ell_i^{-3} \K \bullet (\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2$$
$$\nabla_{\sigma_f,\ELL} \K = \left(\frac{2}{\sigma_f}\K(\X,\X'), \left\{ \ell_i^{-3} \K \bullet (\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2 \right\}_{i=1 \ldots D} \right)$$
Logscale:
$$A = \ln{\sigma_f}$$
$$\sigma_f = e^A$$
$$\frac{\partial k}{\partial A} = 2k$$
$$\frac{\partial \K}{\partial A} = 2\K$$
$$B_i = \ln{\ell_i}$$
$$\ell_i = e^{B_i}$$
$$\frac{\partial k}{\partial B_i} = \ell_i^{-2} k (x_i - x'_i)^2$$
$$\frac{\partial \K}{\partial B_i} = \ell_i^{-2} \K \bullet (\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2$$
$$\nabla_{A,\mathbf{B}} \K = \left( 2\K(\X,\X'), \left\{ \ell_i^{-2} \K \bullet (\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2 \right\}_{i=1 \ldots D} \right)$$
Another mapping:
$$A = \sigma_f^2$$
$$\mathbf{B} = -\frac{1}{2}\ELL^{-2} =
\left(-\frac{1}{2\ell_1^2},\ldots,-\frac{1}{2\ell_D^2} \right)$$
$$A \ge 0$$
$$B_i < 0$$
$$\mathbf{\Theta}^* = \{ A, \mathbf{B}\}$$
$$k(\x,\x') = A e^{(\x-\x')^{\top} diag(\mathbf{B}) (\x-\x')} = A
e^{\sum_{i=1}^D B_i (x_i - x'_i)^2}$$ 
$$\K(\X,\X') = A e^{-\dm(\X,\X',-\mathbf{B})^2}$$
$$\frac{\partial k}{\partial A} = \frac{k(\x,\x')}{A}$$
$$\frac{\partial \K}{\partial A} = \frac{1}{A}\K(\X,\X')$$
$$\frac{\partial k}{\partial B_i} = k(\x,\x') (x_i -x'_i)^2$$
$$\frac{\partial \K}{\partial B_i} = \K(\X,\X') \bullet
(\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2$$
$$\nabla_{A,\mathbf{B}} \K = \left(\frac{\K(\X,\X')}{A}, \left\{ \K(\X,\X')
  \bullet (\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2 \right\}_{i=1
  \ldots D} \right)$$

Note that $\nabla_{A,\mathbf{B}} \K$ requires computing $\K(\X,\X')$
(which is usually already available), and its entrywise product with
$(\X_{*i}\J_{1,N'}- (\X'_{*i}\J_{1,N})^{\top})^2$ which is independent
of the value of the hyperparameters and can be precomputed once for
all.

\section{$\gamma$-Exponential kernels}
$$k(\x,\x') = \sigma_f^2 e^{-\left(\frac{\|\x-\x'\|_2}{\ell}\right)^\gamma}$$
$$k(\x,\x') = \sigma_f^2 e^{-\|\x-\x',\ELL^{-2}\|_2^\gamma}$$

\section{Mat\'ern kernels}
%% $$k_{\mbox{Mat\'ern}}(\x,\x') = \frac{2^{1-\nu}}{\Gamma(\nu)} \left(\sqrt{2\nu} \frac{\|\x-\x'\|_2}{\ell}\right)^{\nu} \K_{\nu}\left(\sqrt{2}\nu \frac{\|\x-\x'\|_2}{\ell} \right)$$
%% $$\ell > 0$$
%% $$\nu > 0$$
%% where $K_{\nu}$ is a modified Bessel function (REMOVE?).
Let $\nu$ be a half-integer, i.e., $\nu = p + 1/2$ with $p$ a
non-negative integer ($p \in \{0,1,2,\ldots\}$), then
$$k_{\nu=p+1/2}(\x,\x') = e^{-\sqrt{2\nu}\frac{\|\x-\x'\|}{\ell}}
\frac{\Gamma(p+1)}{\Gamma(2p+1)} \sum_{i=0}^p
\frac{(p+i)!}{i!(p-i)!}\left(\sqrt{8\nu}\frac{\|\x-\x'\|}{\ell}\right)^{p-i}$$
is the class of Mat\'ern covariance functions with half-integer $\nu$.
$$\nu > 0$$
$$\ell > 0$$
When $\nu \rightarrow \infty$ we obtain the squared exponential
covariance function. Most popular cases of the Mat\'ern functions are
$p=0$ (exponential kernel), $p=1$ and $p=2$:
$$k_{\nu=1/2}(\x,\x') = e^{-\frac{\|\x-\x'\|_2}{\ell}}$$
$$k_{\nu=3/2}(\x,\x') = \left(1+\sqrt{3}\frac{\|\x-\x'\|_2}{\ell}
\right) e^{-\sqrt{3}\frac{\|\x-\x'\|_2}{\ell}}$$ 
$$k_{\nu=5/2}(\x,\x') = \left(1+\sqrt{5}\frac{\|\x-\x'\|_2}{\ell}
 + \frac{5\|\x-\x'\|_2^2}{3\ell^2}\right)
e^{-\sqrt{5}\frac{\|\x-\x'\|_2}{\ell}}$$


\section{Rational Quadratic kernels}
$$k_{RQ}(\x,\x') = \left(1+\frac{\|\x-\x'\|_2^2}{2\alpha\ell^2}
\right)^{-\alpha}$$
$$\alpha > 0$$
$$\ell > 0$$



\end{document}