\chapter{Bayesian Regression Algorithms}%
\label{cha:Bayesian Regression Algorithms}

\section{Bayesian Linear Regression}%
\label{sec:Bayesian Linear Regression}
Für die Bayesian Linear Regression ist es möglich, den Posterior und die Vorhersage ohne die Nutzung von Approximationen zu berechnen.
Hierzu werden die folgenden Komponenten benötigt:
\begin{itemize}
    \item Likelihood (einzelnes Sample): \tabto{6cm}$p(y|\bm x,\bm w) = \nomeq{gaussian_distribution}(y|\bm w^T \nomeq{vector_valued_function},\nomeq{variance})$
    \item Likelihood (ganzer Datensatz): \tabto{6cm}$p(\bm y|\bm X,\bm w) = \prod_i \nomeq{gaussian_distribution}(y_i|\bm w^T \bm\phi(\bm x_i), \nomeq{variance})$
    \item Gaussian Prior: \tabto{6cm}$p(\bm w) = \nomeq{gaussian_distribution}(\bm w|0,\nomeq{regularization_factor}^{-1}\nomeq{identity_matrix})$
\end{itemize}
Anschließend erfolgt die Regression nach den Schritten des \nameref{cha:Bayesian Learning}:
\begin{enumerate}
    \item Posterior errechnen:
        \begin{equation} \label{eq:bayesian_linear_regression_posterior}
            p(\bm w|\bm X,\bm y) = \frac{p(\bm y|\bm X,\bm w)p(\bm w)}{p(\bm y|\bm X)} 
                = \frac{p(\bm y|\bm X,\bm w)p(\bm w)}{\int p(\bm y|\bm X,\bm w)p(\bm w)d\bm w}
        \end{equation}
        Hierfür kann die 2. Gaussian Bayes Rule (\cref{sec:Gaussian Bayes Rules}) verwendet werden\\
        (mit $\bm\mu_{\bm x}=0$, $\nomeq{covariance}_{\bm x} = \nomeq{regularization_factor}^{-1}\nomeq{identity_matrix}$, $\bm F = \bm\Phi$ und $\sigma_{\bm y}^2 = \sigma_{\bm y}^2$)
        \begin{equation} \label{eq:bayesian_linear_regression_posterior_gaussian_bayes_rule}
            p(\bm w|\bm X,\bm y) = \nomeq{gaussian_distribution}(\bm w|\bm\mu_{\bm w|\bm X,\bm y},\nomeq{covariance}_{\bm w|\bm X,\bm y}) 
        \end{equation}
        \begin{itemize}
            \item $\bm\mu_{\bm w|\bm X,\bm y} = (\bm\Phi^T\bm\Phi + \sigma_{\bm y}^2\nomeq{regularization_factor}\nomeq{identity_matrix})^{-1}\bm\Phi^T\bm y$
            \item $\nomeq{covariance}_{\bm w|\bm X,\bm y} = \sigma_{\bm y}^2(\bm\Phi^T\bm\Phi + \sigma_{\bm y}^2\nomeq{regularization_factor}\nomeq{identity_matrix})^{-1}$
        \end{itemize}
    \item Predictive Distribution errechnen:
        \begin{align} \label{eq:bayesion_linear_regression_predictive_distribution}
            p(y^*|\bm x^*,\bm X,\bm y) &= \int p(y^*|\bm w,\bm x^*)p(\bm w|\bm X,\bm y)d\bm w \\
            &= \int \nomeq{gaussian_distribution}(y^*|\bm\phi(\bm x^*)^T\bm w,\sigma_{\bm y}^2)\nomeq{gaussian_distribution}(\bm w|\bm\mu_{\bm w|\bm X,\bm y},\nomeq{covariance}_{\bm w|\bm X,\bm y}) d\bm w
        \end{align}
        Um diese Gleichung zu lösen, kann die \dref{sec:Gaussian Propagation} verwendet werden:
        \begin{itemize}
            \item $\nomeq{mean}(\bm x^*) = \bm\phi(\bm x^*)^T(\bm\Phi^T\bm\Phi + \nomeq{regularization_factor}\sigma_{\bm y}^2\nomeq{identity_matrix})^{-1}\bm\Phi^T\bm y$
            \item $\nomeq{variance}(\bm x^*) = \sigma_{\bm y}^2(1+\bm\phi(\bm x^*)^T(\bm\Phi^T\bm\Phi + \nomeq{regularization_factor}\sigma_{\bm y}^2\nomeq{identity_matrix})^{-1}\bm\phi(\bm x^*))$
        \end{itemize}
\end{enumerate}
Es fällt auf, dass $\nomeq{mean}(\bm x^*)$ sich im Vergleich zur \dref{sub:Ridge Regression} nicht verändert hat.
Allerdings ist $\nomeq{variance}(\bm x^*)$ jetzt abhängig von den Eingangsdaten.

\section{Gaussian Processes}%
\label{sec:Gaussian Processes}
Ein Gaußscher Prozess ist im Grunde nichts anderes als die kernelized version der \dref{sec:Bayesian Linear Regression} ({\color{red}Beweis: Vorlesung 07 Folie 44 ff.}).
\begin{equation} \label{eq:guassian_process_general_definition}
    f(\bm x)\sim\nomeq{gaussian_process}(\underbrace{m(\bm x)}_{\text{mean function}},\underbrace{k(\bm x,\bm x')}_{\text{covariance function}})
\end{equation}
\begin{itemize}
    \item Die \say{mean function} gibt den Prior für die Funktion:
        \begin{equation} \label{eq:gaussian_process_mean_function}
            \mathbb{E}[f(\bm x)] = m(\bm x) 
        \end{equation}
        (im Folgenden wird $m(\bm x) = 0$ angenommen)
    \item Die \say{covariance function} gibt an, wie ähnlich die Funktion $f$ an den Stellen $\bm x$ und $\bm x'$ ist
        \begin{equation}
            \label{eq:gaussian_process_covariance_function}
            \mathbb{E}[f(\bm x)f(\bm x')] = k(\bm x,\bm x') 
        \end{equation}
        (Covariance Function muss positiv definit sein (genau wie Kernel Function (\cref{sec:Positive Definite Kernels})))
\end{itemize}
Für Gaußsche Prozesse ist der Posterior gegeben durch:\\
({\color{red} Herleitung und weitere Informationen Vorlesung 07 Folie 40})
\begin{equation} \label{eq:gaussian_process_posterior}
    p(\bm y|\bm X) = \nomeq{gaussian_distribution}(\bm y|0,\bm K + \sigma_y^2\nomeq{identity_matrix})
\end{equation}
Hierbei ist $\bm K$ die \say{covariance matrix} und nicht die \noms{kernel_matrix} (außer $k$ ist eine \noms{kernel_function})
\begin{equation} \label{eq:covariance_matrix}
    \bm K = \begin{bmatrix}
        k(\bm x_1,\bm x_1) & \cdots & k(\bm x_1,\bm x_N)\\
        \vdots & \ddots & \vdots\\
        k(\bm x_N, \bm x_1) & \cdots & k(\bm x_N,\bm x_N)
    \end{bmatrix}
\end{equation}
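Die Vorhersage $p(y^*|\bm X,\bm y,\bm x^*)$ ist eine \noms{gaussian_distribution},
wobei \noms{mean} und \noms{variance} mit $k^* = k(\bm x^*,\bm x^*)$ und $\bm k_{\bm x^*} = [k(\bm x_1,\bm x^*),\dots,k(\bm x_N,\bm x^*)]^T$ gegeben sind durch:\\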
({\color{red} Herleitung Vorlesung 07 Folie 41})
\begin{itemize}
    \item $\nomeq{mean}(\bm x^*) = \bm k_{\bm x^*}^T(\bm K + \sigma_y^2\nomeq{identity_matrix})^{-1}\bm y$
    \item $\nomeq{variance}(\bm x^*) = k^* + \sigma_y^2 - \bm k_{\bm x^*}^T(\bm K + \sigma_y^2\nomeq{identity_matrix})^{-1}\bm k_{\bm x^*}$
\end{itemize}