diff --git a/chapters/Classical_Supervised_Learning/Linear_Classification.tex b/chapters/Classical_Supervised_Learning/Linear_Classification.tex
index 2c64be6..fed4653 100644
--- a/chapters/Classical_Supervised_Learning/Linear_Classification.tex
+++ b/chapters/Classical_Supervised_Learning/Linear_Classification.tex
@@ -53,7 +53,7 @@ $\bm{w}$ ist der Normalvektor (normal) zur Geraden und $b$ das Bias.
 \includegraphics[width=\textwidth]{linear_separability.png}
 
 \subsection{Optimization}%
-\label{sub:Optimization}
+\label{sub:Binary Classification:Optimization}
 Um den linearen Klassifikator zu optimieren sind mehrere Methoden denkbar.
 \subsubsection{0-1 loss}%
 \label{ssub:0-1 loss}
@@ -210,7 +210,7 @@ Dies stellt eine Approximation des tatsächlich erwarteten Verlustes nach dem Pr
     \mathbb{E}_{\bm{x}}\left[l(\bm{x};\bm{\theta})\right]\qquad \bm{\theta}_{t+1} = \bm{\theta}_t - \eta\mathbb{E}_{\bm{x}}\left[\nabla_{\bm{\theta}} l(\bm{x};\bm{\theta}_t)\right]
 \end{equation}
 
-\subsection{\glsxtrfull{SDG}}%
+\subsection{\texorpdfstring{\glsxtrfull{SDG}}{\glsfmtfull{SDG}}}%
 \label{sub:SDG}
 \begin{wrapfigure}{r}{.5\textwidth}
     \vspace*{-15mm}
diff --git a/chapters/Classical_Supervised_Learning/Linear_Regression.tex b/chapters/Classical_Supervised_Learning/Linear_Regression.tex
index 5e24d63..62e1a68 100644
--- a/chapters/Classical_Supervised_Learning/Linear_Regression.tex
+++ b/chapters/Classical_Supervised_Learning/Linear_Regression.tex
@@ -83,7 +83,7 @@ Dies ermöglicht es mittels der linearen Regression auch jede nicht-lineare Funk
 indem eine passende \nomf{vector_valued_function} gefunden wird.
 
 \subsection{Beispiele}%
-\label{sub:Beispiele}
+\label{sub:linear Regression:Beispiele}
 \subsubsection{Polynomial Curve Fitting}%
 \label{ssub:Polynomial Curve Fitting}
 \begin{wrapfigure}{r}{.4\textwidth}
diff --git a/chapters/Classical_Supervised_Learning/Model_Selection.tex b/chapters/Classical_Supervised_Learning/Model_Selection.tex
index 79a3010..6a74b74 100644
--- a/chapters/Classical_Supervised_Learning/Model_Selection.tex
+++ b/chapters/Classical_Supervised_Learning/Model_Selection.tex
@@ -101,7 +101,7 @@ Um die Nachteile der \nameref{sub:Hold-out Mehtod} zu umgehen wird meist die Cro
 
 \subsubsection{Sonderformen der Cross Validation}%
 \label{ssub:Sonderformen der Cross Validation}
-    \paragraph{\glsxtrfull{LLO} Cross Validation}%
+    \paragraph{\texorpdfstring{\glsxtrfull{LLO} Cross Validation}{\glsfmtfull{LLO} Cross Validation}}%
     \label{par:LLO Cross Validation}
     Sonderform, bei der $k=n$,
     wodurch es genau so viele Durchläufe wie Datenpunkte gibt
@@ -158,7 +158,7 @@ Man spricht hierbei von Data Augmentation.
         \centering
         \includegraphics[width=\linewidth]{artificial_noise2.png}
         \caption{mögl. Diskriminanten mit künstlichen Noise}
-        \label{fig:regression_without_artifical_noise}
+        \label{fig:regression_with_artifical_noise}
     \end{subfigure}
     \caption{Einfluss von künstlichen Noise}
     \label{fig:artificial_noise}
diff --git a/chapters/Classical_Supervised_Learning/Trees_and_Forests.tex b/chapters/Classical_Supervised_Learning/Trees_and_Forests.tex
index 26607c4..200d7fe 100644
--- a/chapters/Classical_Supervised_Learning/Trees_and_Forests.tex
+++ b/chapters/Classical_Supervised_Learning/Trees_and_Forests.tex
@@ -1,6 +1,6 @@
 \chapter{Trees and Forests}%
 \label{cha:Trees and Forests}
-\section{\glsxtrfull{CART}}%
+\section{\texorpdfstring{\glsxtrfull{CART}}{\glsfmtfull{CART}}}%
 \label{sec:CART}
 
 \begin{wrapfigure}[8]{r}{.5\textwidth}
@@ -77,7 +77,7 @@ In dieser Formel gibt $p_L(k)$ an,
 welchen Anteil die Klasse $k$ auf der linken Seite des Splits hat.
 
 \subsection{Beispiele}%
-\label{sub:Beispiele}
+\label{sub:CART:Beispiele}
 \subsubsection{Classification Tree}%
 \label{ssub:Classification Tree}
 \includegraphics[width=.6\textwidth]{classification_tree.png}
diff --git a/chapters/Classical_Supervised_Learning/k-Nearest_Neighbors.tex b/chapters/Classical_Supervised_Learning/k-Nearest_Neighbors.tex
index f787df4..f3d9731 100644
--- a/chapters/Classical_Supervised_Learning/k-Nearest_Neighbors.tex
+++ b/chapters/Classical_Supervised_Learning/k-Nearest_Neighbors.tex
@@ -1,4 +1,4 @@
-\chapter{\glsfmtfull{knn}}%
+\chapter[\glsfmtfull{knn}]{\texorpdfstring{\glsxtrfull{knn}}{\glsfmtfull{knn}}}%
 \label{cha:k-Nearest Neighbors}
 Beim \gls{knn}-Verfahren wird dem System eine Reihe von gelabelten Trainingsdaten übergeben.
 Für die Klassifizierung erfolgt durch
diff --git a/chapters/Einleitung.tex b/chapters/Einleitung.tex
index d103f76..2843046 100644
--- a/chapters/Einleitung.tex
+++ b/chapters/Einleitung.tex
@@ -60,7 +60,7 @@ Dies liegt vor allem an den folgenden Einflüssen:
 \end{itemize}
 
 \section{Anwendungsbeispiele}%
-\label{sec:Anwendungsbeispiele}
+\label{sec:ML:Anwendungsbeispiele}
 \begin{itemize}
     \item Handschrifterkennung: Klassifizierungsproblem
     \item Gesichtserkennung: Klassifizierungsproblem
diff --git a/chapters/Kernel_Methods/Kernel-Regression.tex b/chapters/Kernel_Methods/Kernel-Regression.tex
index c3481ca..45daeb3 100644
--- a/chapters/Kernel_Methods/Kernel-Regression.tex
+++ b/chapters/Kernel_Methods/Kernel-Regression.tex
@@ -45,7 +45,7 @@ Die Auswahl der passenden Hyperparameter (z.B. \nomsym{variance} für den \namer
     \label{fig:gaussian_kernel_model_selection}
 \end{figure}
 
-\section{Examples and comparison to \glsxtrshort{RBF} regression}%
+\section{Examples and comparison to \texorpdfstring{\glsxtrshort{RBF}}{\glsfmtshort{RBF}} regression}%
 \label{sec:Examples and comparison to RBF regression}
 \begin{center}
     \includegraphics[width=.9\textwidth]{kernel_regression_comparison.pdf}
diff --git a/chapters/Kernel_Methods/Support_Vector_Machines.tex b/chapters/Kernel_Methods/Support_Vector_Machines.tex
index 2070890..22cb0f8 100644
--- a/chapters/Kernel_Methods/Support_Vector_Machines.tex
+++ b/chapters/Kernel_Methods/Support_Vector_Machines.tex
@@ -1,4 +1,4 @@
-\chapter{\glsfmtfull{SVM}}%
+\chapter[\glsfmtfull{SVM}]{\texorpdfstring{\glsxtrfull{SVM}}{\glsfmtfull{SVM}}}%
 \label{cha:SVM}
 \glspl{SVM} sind eine Methode zur binären Klassifikation (\cref{sec:Binary Classification}).
 Anders als bei anderen Algorithmen werden die Klassen hierbei nicht mit 0 und 1,
@@ -52,7 +52,7 @@ dass $\nomeq{margin}=\frac{2}{\|\bm w\|}$ ist.
 \begin{wrapfigure}{r}{.5\textwidth}
     \centering
     \includegraphics[width=\linewidth]{svm_positive_negative_support.png}
-    \caption{Support Vektoren einer \glsxtrshort{SVM}}
+    \caption{Support Vektoren einer \texorpdfstring{\glsxtrshort{SVM}}{\glsfmtshort{SVM}}}
     \label{fig:svm_positive_negative_support}
     \vspace*{-10mm}
 \end{wrapfigure}
@@ -62,7 +62,7 @@ Zudem lassen sich im gleichen Zug die positiven und negativen Support Vektoren d
     \item negativer Support Vektor: $\bm w^T \bm x_- + b = +1$
 \end{itemize}
 
-\subsection{\glsxtrshort{SVM} Optimization}%
+\subsection{\texorpdfstring{\glsxtrshort{SVM} Optimization}{\glsfmtshort{SVM} Optimization}}%
 \label{sub:SVM Optimization}
 Das Problem ist für das Maximum Margin Verfahren gegeben durch:
 \begin{equation} \label{eq:maximum_margin_optimization_problem}
@@ -117,7 +117,7 @@ Die Interpretation der \noms{slack-variable} erfolgt dabei wie folgt:
 \end{figure}
 
 \subsection{Optimization}%
-\label{sub:Optimization}
+\label{sub:Soft Max-Margin:Optimization}
 Das Optimierungsproblem für die Soft Max-Margin Methode ist gegeben durch:
 \begin{equation} \label{eq:soft_max-margin_optimization}
     \argmin_{\bm w, \bm\xi} \|\bm w\|^2 + C\sum_i^N\nomeq{slack-variable}\qquad y_i(\bm w^T\bm x_i + b)\ge 1-\nomeq{slack-variable}, \nomeq{slack-variable}\ge 0
@@ -154,10 +154,10 @@ Im Falle des Hinge Loss bedeutet das:
 \end{equation}
 
 \section{Anwendungsbeispiele}%
-\label{sec:Anwendungsbeispiele}
+\label{sec:SVM:Anwendungsbeispiele}
 {\color{red} siehe Vorlesung 06 Folien 34 ff.}
 
-\section{\glsxtrshortpl{SVM} with Kernels}%
+\section{\texorpdfstring{\glsxtrshortpl{SVM} with Kernels}{\glsfmtshortpl{SVM} with Kernels}}%
 \label{sec:SVMs with Kernels}
 Mithilfe des Kernel Tricks (\cref{sec:Kernel Trick}) und der Lagrangian Optimization (\cref{sec:Lagrangian Multipliers}) kann die \gls{SVM}-Optimierung als Dual Optimization Problem formuliert werden ({\color{red} Herleitung Vorlesung 06 Folien 52-56}):
 \begin{itemize}
@@ -185,6 +185,6 @@ Die verstellbaren Parameter sind hierbei:
     \item die Parameter des gewählten Kernels 
 \end{itemize}
 
-\subsection{Beispiele}%
-\label{sub:Beispiele}
+\subsubsection{Beispiele}%
+\label{ssub:SVM:Model Selection:Beispiele}
 {\color{red} siehe Vorlesung 06 Folien 57-60 und 62-63}
diff --git a/chapters/Mathematische_Grundlagen/Probability_Theory.tex b/chapters/Mathematische_Grundlagen/Probability_Theory.tex
index 74a977d..b0a8665 100644
--- a/chapters/Mathematische_Grundlagen/Probability_Theory.tex
+++ b/chapters/Mathematische_Grundlagen/Probability_Theory.tex
@@ -167,7 +167,7 @@ Die Verteilung wird durch den \nomf{mean-vector} und die \nomf{covariance} volls
     \item die Summe von zwei gaußschen Normalverteilungen ist wieder eine gaußsche Normalverteilung
 \end{itemize}
 
-\section{\glsxtrfull{MLE}}%
+\section{\texorpdfstring{\glsxtrfull{MLE}}{\glsfmtfull{MLE}}}%
 \label{sec:MLE}
 Für einen gegebenen Trainingsdatensatz $D=\{(x_i,y_i)\}_{i=1\dots N}$ von unabhängig und identisch verteilten Zufallsvariablen (\gls{iid})
 auf Basis einer Wahrscheinlichkeitsdichtefunktion $p_{\text{data}}$ soll ein $\bm{\theta}$ gefunden werden,
@@ -183,7 +183,7 @@ In Bezug auf den Gesamten Datensatz bedeutet dies:
     \text{lik}(\bm{\theta};D) = \prod_i p_{\bm{\theta}}(x_i,y_i)
 \end{equation}
 Und die Log-likelihood ist definiert durch:
-\begin{equation} \label{eq:fittness_theta_whole_dataset}
+\begin{equation} \label{eq:loglik_theta_whole_dataset}
     \log\text{lik}(\bm{\theta};D) = \sum_i \log p_{\bm{\theta}}(x_i,y_i)
 \end{equation}
 Dieser wird zumeist für die Optimierung vewendet, da
@@ -211,7 +211,7 @@ In diesem Zusammenhang berechnet sich die \gls{MLE} durch:
         \nomeq{mean} &= \dfrac{\sum_i x_i}{N}
     \end{align}
 
-\subsection{\glsxtrshort{MLE}: conditional log-likelihood}%
+\subsection{\texorpdfstring{\glsxtrshort{MLE}}{\glsfmtshort{MLE}}: conditional log-likelihood}%
 \label{sub:MLE: conditional log-likelihood}
     \begin{equation} \label{eq:MLE:conditional}
         \log\text{lik}(\bm{\theta};D) = \sum_i \log p_{\bm{\theta}}(y_i|x_i)