diff --git a/Glossary.tex b/Glossary.tex
index 10cea49..262dfea 100644
--- a/Glossary.tex
+++ b/Glossary.tex
@@ -66,7 +66,7 @@
 \newcommand{\noms}[1]{\glsentryname{#1}\xspace}
 \newcommand{\nomS}[1]{\Glsentryname{#1}\xspace}
 %use nomenclature entry (symbol only)
-\newcommand{\nomsym}[1]{\texorpdfstring{\glslink{#1}{\boldmath\ensuremath{\glsentrysymbol{#1}}}}{#1}\xspace}
+\newcommand{\nomsym}[1]{\texorpdfstring{\glslink{#1}{\ensuremath{\glsentrysymbol{#1}}}}{#1}\xspace}
 %use nomenclature entry (use in equation)
 \newcommand{\nomeq}[1]{\glslink{#1}{\glsentrysymbol{#1}}}
 
@@ -74,7 +74,8 @@
 \newnom{gaussian_noise}{Gausches Rauschen}{\epsilon}{zufällige (normalverteilte) Abweichung}{}
 \newnom{vector_valued_function}{vektorwertige Funktion}{\phi(\bm{x})}{vektorwertige Funktion der des Eingangsvektor $\bm{x}$}{}
 \newnom{regularization_factor}{Regularisierungsfaktor}{\lambda}{}{}
-\newnom{identity_matrix}{Identitätsmatrix}{\bm{I}}{$\begin{bmatrix} 1 & 0 & \cdots & 0 \\ 0 & 1 & cdots & 0 \\ \vdots & \vdots & \ddots & \vdots\\ 0 & 0 & \cdots & 1 \end{bmatrix}$}{}
+\newnom{identity_matrix}{Identitätsmatrix}{\bm{I}}{$\begin{bmatrix} 1 & 0 & \cdots & 0 \\ 0 & 1 & \cdots & 0 \\ \vdots & \vdots & \ddots & \vdots\\ 0 & 0 & \cdots & 1 \end{bmatrix}$}{}
+\newnom{probability_mass_function}{Probability Mass Function}{p(x)}{Wahrscheinlichkeitsdichte-\slash\,Wahrscheinlichkeitsmassefunktion}{}
 
 \shorthandoff{"}
 
diff --git a/ML_Zusammenfassung.tex b/ML_Zusammenfassung.tex
index 7c0f30f..0b935d3 100644
--- a/ML_Zusammenfassung.tex
+++ b/ML_Zusammenfassung.tex
@@ -31,6 +31,11 @@
     \part{Einleitung}
     \input{chapters/Einleitung.tex}
 
+    \part{Mathematische Grundlagen}
+    \label{part:Mathematische Grundlagen}
+    \input{chapters/Mathematische_Grundlagen/Lineare_Algebra.tex}
+    \input{chapters/Mathematische_Grundlagen/Probability_Theory.tex}
+
     \part{Classical Supervised Learning}
     \label{part:Classical Supervised Learning}
     \input{chapters/Classical_Supervised_Learning/Linear_Regression.tex}
diff --git a/chapters/Classical_Supervised_Learning/Linear_Regression.tex b/chapters/Classical_Supervised_Learning/Linear_Regression.tex
index 888de0c..1979e3a 100644
--- a/chapters/Classical_Supervised_Learning/Linear_Regression.tex
+++ b/chapters/Classical_Supervised_Learning/Linear_Regression.tex
@@ -1,100 +1,6 @@
 \chapter{Linear Regression}%
 \label{cha:Linear Regression}
 
-Das Ziel von linearer Regression ist es eine Gerade zu finden,
-die eine Menge von Eingabedatenpunkten am besten approximiert.
-
-\section{Lineare Algebra}%
-\label{sec:Lineare Algebra}
-
-\subsection{Vektoren}%
-\label{sec:Vektoren}
-\begin{itemize}
-    \item Alle Vektoren werden \textbf{fett} geschrieben:
-        $\underbrace{x=1}_{\text{Skalar}},\qquad\underbrace{\bm{x}=\begin{bmatrix} 1\\2\\4 \end{bmatrix}}_{\text{Vektor}}$
-    \item ein Vektor $\bm{x}$ ist immer ein Spaltenvektor:
-        $\bm{x}=\begin{bmatrix} 1\\2\\4 \end{bmatrix}$
-    \item ein transponierter Vektor $\bm{x}^T$ ist immer ein Reihenvektor:
-        $\bm{x}^T=\begin{bmatrix} 1 & 2 & 4 \end{bmatrix}$
-    \item Multiplikation eines Vektors mit einem Skalar:
-        $2\begin{bmatrix} 1\\2\\4 \end{bmatrix} = \begin{bmatrix} 2\\4\\8\end{bmatrix}$
-    \item Vektoraddition:
-        $\begin{bmatrix} 1\\2\\4 \end{bmatrix} + \begin{bmatrix} 2\\1\\4 \end{bmatrix} = \begin{bmatrix} 3\\3\\8 \end{bmatrix}$
-    \item Skalarprodukt $\langle \bm{v},\bm{w}\rangle$ der Vektoren $\bm{v}=\begin{bmatrix} 1\\2\\4 \end{bmatrix}$ und $\bm{w}=\begin{bmatrix} 2\\4\\8 \end{bmatrix}$:
-        $\langle \bm{v},\bm{w}\rangle = 1\cdot 2 + 2\cdot 4 + 4\cdot 8 = 42$
-    \item Länge eines Vektors:
-        $||\bm{v}|| = \langle \bm{v},\bm{v}\rangle^{\frac{1}{2}} = (1^2 + 2^2 + 4^2)^{\frac{1}{2}} = \sqrt{21}$
-\end{itemize}
-
-\subsection{Matrizen}%
-\label{sec:Matrizen}
-\begin{itemize}
-    \item Matrizen werden \uppercase{groß} und \textbf{fett} geschrieben:
-        $\bm{X}=\begin{bmatrix} 1&3\\2&3\\4&7 \end{bmatrix}\qquad\bm{A}=\begin{bmatrix}1&3&5&4\\2&3&7&2 \end{bmatrix}$
-    \item Mehrere Vektoren können zu einer Matrix zusammengefasst werden.
-        Hierbei werden die Vektoren meistens zu Zeilen und repräsentieren einen Einzelnen Datensatz:
-        $$\bm{x}_1 = \begin{bmatrix} 37\\72\\175 \end{bmatrix} \qquad \bm{x}_2 = \begin{bmatrix} 10\\30\\61 \end{bmatrix}\qquad \bm{x}_3 = \begin{bmatrix} 25\\65\\121 \end{bmatrix}\qquad \bm{x}_4 = \begin{bmatrix} 66\\67\\175 \end{bmatrix}$$
-        $$\bm{X} = \begin{bmatrix} \bm{x}_1^T\\\bm{x}_2^T\\\bm{x}_3^T\\\bm{x}_4^T \end{bmatrix} = \begin{bmatrix} 37&72&175\\10&30&61\\25&65&121\\66&67&175 \end{bmatrix}$$
-    \item Multiplikation mit einem Skalar:
-        $ 3\bm{M} = 3\begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix} = \begin{bmatrix} 9 & 12 & 15\\ 3 & 0 & 3 \end{bmatrix}$
-    \item Addition von Matrizen:
-        $ \bm{M} + \bm{N} = \begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix} + \begin{bmatrix} 1 & 2 & 1 \\ 3 & 1 & 1 \end{bmatrix} = \begin{bmatrix} 4 & 6 & 6 \\ 4 & 1 & 2 \end{bmatrix}$
-    \item Transponierte Matrizen:
-        $ \bm{M} = \begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix}, \bm{M}^T = \begin{bmatrix} 3 & 1\\ 4 & 0\\ 5 & 1 \end{bmatrix}$
-    \item Matrix-Vektor-Produkt:
-        $$ \bm{Wv} = \underbrace{\begin{bmatrix} \bm{w}_1 & \cdots & \bm{w}_n \end{bmatrix}}_{\bm{W}}\underbrace{\begin{bmatrix} v_1 \\ \vdots \\ v_n \end{bmatrix}}_{\bm{v}} = \begin{bmatrix} v_1\bm{w}_1 + \cdots + v_n\bm{w}_n \end{bmatrix} $$
-    $$ \bm{Wv} = \begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix}\begin{bmatrix} 3\\0\\2 \end{bmatrix} = \begin{bmatrix} 3\cdot1 + 4\cdot0 + 5\cdot2 \\ 1\cdot1 + 0\cdot0 + 1\cdot2 \end{bmatrix} = \begin{bmatrix} 13\\3 \end{bmatrix}$$
-    \item Matrix-Matrix-Produkt:
-        $$ \bm{WV} = \begin{bmatrix} 3 & 4 & 5\\ 1 & 0 & 1 \end{bmatrix}\begin{bmatrix} 1 & 0\\0 & 3\\2 & 4 \end{bmatrix} = 
-        \begin{bmatrix} 3\cdot1+4\cdot0+5\cdot2 & 3\cdot0+4\cdot3+5\cdot4\\ 1\cdot1+0\cdot0+1\cdot2 & 1\cdot0+0\cdot3+1\cdot4 \end{bmatrix} = \begin{bmatrix} 13 & 32\\ 3 & 4 \end{bmatrix}$$
-        \begin{itemize}
-            \item Dimensionen: $\underbrace{m\times n}_{\bm{W}}\cdot\underbrace{n\times j}_{\bm{V}} = \underbrace{m\times j}_{\bm{U}}$
-            \item nicht kommutativ: $\bm{VW} \neq \bm{WV}$
-            \item Assoziativ: $\bm{V}(\bm{WX}) = (\bm{VW})\bm{X}$
-            \item Transponiertes Produkt: $(\bm{VW})^T = \bm{W}^T\bm{V}^T$
-        \end{itemize}
-    \item Reihen"~ und Spaltendurchschnitt:
-        $$\bm{X} = \begin{bmatrix} \bm{X}_{1,1} & \cdots & \bm{X}_{1,m}\\ \vdots & \ddots & \vdots\\ \bm{X}_{n,1} & \cdots & \bm{X}_{n,m} \end{bmatrix}$$
-        \begin{itemize}
-            \item Vektor von Reihen-Durchschnitten:
-                $$\begin{bmatrix} \frac{1}{m}\sum_{i=1}^{m} X_{1,i}\\ \vdots \\ \frac{1}{m}\sum_{i=1}^{m} X_{n,i} \end{bmatrix} = \bm{X}\begin{bmatrix} \frac{1}{m}\\\vdots\\\frac{1}{m} \end{bmatrix} = \bm{Xa},
-                \text{ mit } \bm{a} = \begin{bmatrix} \frac{1}{m}\\\vdots\\\frac{1}{m} \end{bmatrix}$$
-            \item Vektor von Spalten-Durchschnitten:
-                $$ \begin{bmatrix} \frac{1}{n}\sum_{i=1}^{n} X_{i,1} & \cdots & \frac{1}{n}\sum_{i=1}^{n}X_{i,m} \end{bmatrix} = \begin{bmatrix} \frac{1}{n} & \cdots & \frac{1}{n} \end{bmatrix}\bm{X} = 
-                \bm{b}^T\bm{X}, \text{ mit }\bm{b} = \begin{bmatrix} \frac{1}{n}\\\vdots\\\frac{1}{n} \end{bmatrix}$$
-        \end{itemize}
-    \item Matrix Inverse:
-        \begin{itemize}
-            \item Definition: $\bm{WW}^{-1} = \bm{I},\qquad\bm{W}^{-1}\bm{W} = \bm{I}$
-            \item Identiätsmatrix $\bm{I} = \begin{bmatrix} 1 & \cdots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \cdots & 1 \end{bmatrix}$
-            \item \textbf{nur} quadratische Matrizen können invertiert werden
-        \end{itemize}
-\end{itemize}
-
-\subsection{Matrix-Calculus}%
-\label{sec:Matrix-Calculus}
-\begin{itemize}
-    \item Die Ableitung einer skalare Funktion, die auf einem Vektor ausgewertet wird ergibt einen Gradientenvektor:
-        $$\nabla_{\bm{x}}f = \frac{\partial f(\bm{x})}{\partial \bm{x}} = \begin{bmatrix} \frac{\partial f(\bm{x})}{\partial x_1} \\\vdots\\ \frac{\partial f(\bm{x})}{\partial x_d}\end{bmatrix}$$
-    \item Die Ableitung einer Vektor-Funktion, die auf einem Vektor ausgewertet wird ergibt eine Jacobische Matrix
-        $$\nabla_{\bm{x}}\bm{f} = \frac{\partial \bm{f}(\bm{x})}{\partial \bm{x}} 
-        = \begin{bmatrix} \frac{\partial f_1(\bm{x})}{\partial x_1} & \cdots & \frac{\partial f_k(\bm{x})}{\partial x_1} \\ \vdots & \ddots & \vdots \\ \frac{\partial f_1(\bm{x})}{\partial x_d} & \cdots & \frac{\partial f_k(\bm{x})}{\partial x_d}\end{bmatrix}$$
-    \item die Ableitung einer skalaren Funktion, die auf einer Matrix ausgewertet wird ergibt eine Matrix:
-        $$\nabla_{\bm{W}}f = \frac{\partial f(\bm{W})}{\partial \bm{W}} 
-        = \begin{bmatrix} \frac{\partial f(\bm{W})}{\partial W_{11}} & \cdots & \frac{\partial f(\bm{W})}{\partial W_{1d}} \\ \vdots & \ddots & \vdots \\ \frac{\partial f(\bm{W})}{\partial W_{k1}} & \cdots & \frac{\partial f(\bm{W})}{\partial W_{kd}}\end{bmatrix}$$
-    \item die Ableitung einer Vektor-Funktion, die auf einer Matrix ausgewertet wird ergibt einen 3D-Tensor (sehr kompliziert, wird (fast) nie benötigt)
-    \item Grundlegende Formeln:\\
-        \begin{tabular}{l c|c|c}
-            & \bfseries Skalar & \bfseries Vektor & \bfseries Matrix\\
-            &&&\\
-            \textbf{Linear} & $\frac{\partial a x}{\partial x} = a$ & $\nabla_{\bm{x}}\bm{Ax} = \bm{A}^T$ & \begin{tabular}{@{}c@{}} $\nabla_{\bm{X}}\bm{a}^T\bm{Xb} = \bm{ab}^T $ \\ $\nabla_{\bm{X}}tr(\bm{AXB}) = \bm{A}^T\bm{B}^T$ \end{tabular}\\
-            &&&\\
-            \textbf{Quadratisch} & $\frac{\partial x^2}{\partial x} = 2x$ &  $\begin{aligned} \nabla_{\bm{x}}\bm{x}^T\bm{x} &= 2\bm{x} \\ \nabla_{\bm{x}}\bm{x}^T\bm{Ax} &= (\bm{A}^T + \bm{A})\bm{x}\\&=2\bm{Ax}\text{ wenn $A$ symmetrisch ist }\end{aligned}$ & 
-        \end{tabular}
-\end{itemize}
-
-\section{Regression}%
-\label{sec:Regression}
 Das Ziel einer Regression ist es eine kontinuierliche Funktion $y=f(x)+\nomeq{gaussian_noise}$ zu lernen.
 Im Falle der linearen Regression bedeutet das,
 dass versucht wird eine Gerade zu finden,
@@ -113,7 +19,7 @@ Die Regression verfolgt hierbei zumeist das Ziel die Summe oder den Durchschnitt
     \nomeq{summed_squared_error} = \sum_{i=1}^{N}(y_i-f(x_i))^2
 \end{equation}
 
-\subsection{Regression für d-dimensionale Eingabevektoren}%
+\section{Regression für d-dimensionale Eingabevektoren}%
 \label{sub:Regression für d-dimensionale Eingabevektoren}
 Wenn die Eingangswerte durch einen d-dimensionalen Vektor $\bm{x}$ dargestellt werden,
 ergibt sich die folgende Funktion:
diff --git a/chapters/Mathematische_Grundlagen/Lineare_Algebra.tex b/chapters/Mathematische_Grundlagen/Lineare_Algebra.tex
new file mode 100644
index 0000000..8cd549f
--- /dev/null
+++ b/chapters/Mathematische_Grundlagen/Lineare_Algebra.tex
@@ -0,0 +1,90 @@
+\chapter{Lineare Algebra}%
+\label{cha:Lineare Algebra}
+
+\section{Vektoren}%
+\label{sec:Vektoren}
+\begin{itemize}
+    \item Alle Vektoren werden \textbf{fett} geschrieben:
+        $\underbrace{x=1}_{\text{Skalar}},\qquad\underbrace{\bm{x}=\begin{bmatrix} 1\\2\\4 \end{bmatrix}}_{\text{Vektor}}$
+    \item ein Vektor $\bm{x}$ ist immer ein Spaltenvektor:
+        $\bm{x}=\begin{bmatrix} 1\\2\\4 \end{bmatrix}$
+    \item ein transponierter Vektor $\bm{x}^T$ ist immer ein Reihenvektor:
+        $\bm{x}^T=\begin{bmatrix} 1 & 2 & 4 \end{bmatrix}$
+    \item Multiplikation eines Vektors mit einem Skalar:
+        $2\begin{bmatrix} 1\\2\\4 \end{bmatrix} = \begin{bmatrix} 2\\4\\8\end{bmatrix}$
+    \item Vektoraddition:
+        $\begin{bmatrix} 1\\2\\4 \end{bmatrix} + \begin{bmatrix} 2\\1\\4 \end{bmatrix} = \begin{bmatrix} 3\\3\\8 \end{bmatrix}$
+    \item Skalarprodukt $\langle \bm{v},\bm{w}\rangle$ der Vektoren $\bm{v}=\begin{bmatrix} 1\\2\\4 \end{bmatrix}$ und $\bm{w}=\begin{bmatrix} 2\\4\\8 \end{bmatrix}$:
+        $\langle \bm{v},\bm{w}\rangle = 1\cdot 2 + 2\cdot 4 + 4\cdot 8 = 42$
+    \item Länge eines Vektors:
+        $||\bm{v}|| = \langle \bm{v},\bm{v}\rangle^{\frac{1}{2}} = (1^2 + 2^2 + 4^2)^{\frac{1}{2}} = \sqrt{21}$
+\end{itemize}
+
+\section{Matrizen}%
+\label{sec:Matrizen}
+\begin{itemize}
+    \item Matrizen werden \uppercase{groß} und \textbf{fett} geschrieben:
+        $\bm{X}=\begin{bmatrix} 1&3\\2&3\\4&7 \end{bmatrix}\qquad\bm{A}=\begin{bmatrix}1&3&5&4\\2&3&7&2 \end{bmatrix}$
+    \item Mehrere Vektoren können zu einer Matrix zusammengefasst werden.
+        Hierbei werden die Vektoren meistens zu Zeilen, die jeweils einen einzelnen Datensatz repräsentieren:
+        $$\bm{x}_1 = \begin{bmatrix} 37\\72\\175 \end{bmatrix} \qquad \bm{x}_2 = \begin{bmatrix} 10\\30\\61 \end{bmatrix}\qquad \bm{x}_3 = \begin{bmatrix} 25\\65\\121 \end{bmatrix}\qquad \bm{x}_4 = \begin{bmatrix} 66\\67\\175 \end{bmatrix}$$
+        $$\bm{X} = \begin{bmatrix} \bm{x}_1^T\\\bm{x}_2^T\\\bm{x}_3^T\\\bm{x}_4^T \end{bmatrix} = \begin{bmatrix} 37&72&175\\10&30&61\\25&65&121\\66&67&175 \end{bmatrix}$$
+    \item Multiplikation mit einem Skalar:
+        $ 3\bm{M} = 3\begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix} = \begin{bmatrix} 9 & 12 & 15\\ 3 & 0 & 3 \end{bmatrix}$
+    \item Addition von Matrizen:
+        $ \bm{M} + \bm{N} = \begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix} + \begin{bmatrix} 1 & 2 & 1 \\ 3 & 1 & 1 \end{bmatrix} = \begin{bmatrix} 4 & 6 & 6 \\ 4 & 1 & 2 \end{bmatrix}$
+    \item Transponierte Matrizen:
+        $ \bm{M} = \begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix}, \bm{M}^T = \begin{bmatrix} 3 & 1\\ 4 & 0\\ 5 & 1 \end{bmatrix}$
+    \item Matrix-Vektor-Produkt:
+        $$ \bm{Wv} = \underbrace{\begin{bmatrix} \bm{w}_1 & \cdots & \bm{w}_n \end{bmatrix}}_{\bm{W}}\underbrace{\begin{bmatrix} v_1 \\ \vdots \\ v_n \end{bmatrix}}_{\bm{v}} = \begin{bmatrix} v_1\bm{w}_1 + \cdots + v_n\bm{w}_n \end{bmatrix} $$
+    $$ \bm{Wv} = \begin{bmatrix} 3 & 4 & 5 \\ 1 & 0 & 1 \end{bmatrix}\begin{bmatrix} 1\\0\\2 \end{bmatrix} = \begin{bmatrix} 3\cdot1 + 4\cdot0 + 5\cdot2 \\ 1\cdot1 + 0\cdot0 + 1\cdot2 \end{bmatrix} = \begin{bmatrix} 13\\3 \end{bmatrix}$$
+    \item Matrix-Matrix-Produkt:
+        $$ \bm{WV} = \begin{bmatrix} 3 & 4 & 5\\ 1 & 0 & 1 \end{bmatrix}\begin{bmatrix} 1 & 0\\0 & 3\\2 & 4 \end{bmatrix} = 
+        \begin{bmatrix} 3\cdot1+4\cdot0+5\cdot2 & 3\cdot0+4\cdot3+5\cdot4\\ 1\cdot1+0\cdot0+1\cdot2 & 1\cdot0+0\cdot3+1\cdot4 \end{bmatrix} = \begin{bmatrix} 13 & 32\\ 3 & 4 \end{bmatrix}$$
+        \begin{itemize}
+            \item Dimensionen: $\underbrace{m\times n}_{\bm{W}}\cdot\underbrace{n\times j}_{\bm{V}} = \underbrace{m\times j}_{\bm{U}}$
+            \item nicht kommutativ: $\bm{VW} \neq \bm{WV}$
+            \item Assoziativ: $\bm{V}(\bm{WX}) = (\bm{VW})\bm{X}$
+            \item Transponiertes Produkt: $(\bm{VW})^T = \bm{W}^T\bm{V}^T$
+        \end{itemize}
+    \item Reihen"~ und Spaltendurchschnitt:
+        $$\bm{X} = \begin{bmatrix} X_{1,1} & \cdots & X_{1,m}\\ \vdots & \ddots & \vdots\\ X_{n,1} & \cdots & X_{n,m} \end{bmatrix}$$
+        \begin{itemize}
+            \item Vektor von Reihen-Durchschnitten:
+                $$\begin{bmatrix} \frac{1}{m}\sum_{i=1}^{m} X_{1,i}\\ \vdots \\ \frac{1}{m}\sum_{i=1}^{m} X_{n,i} \end{bmatrix} = \bm{X}\begin{bmatrix} \frac{1}{m}\\\vdots\\\frac{1}{m} \end{bmatrix} = \bm{Xa},
+                \text{ mit } \bm{a} = \begin{bmatrix} \frac{1}{m}\\\vdots\\\frac{1}{m} \end{bmatrix}$$
+            \item Vektor von Spalten-Durchschnitten:
+                $$ \begin{bmatrix} \frac{1}{n}\sum_{i=1}^{n} X_{i,1} & \cdots & \frac{1}{n}\sum_{i=1}^{n}X_{i,m} \end{bmatrix} = \begin{bmatrix} \frac{1}{n} & \cdots & \frac{1}{n} \end{bmatrix}\bm{X} = 
+                \bm{b}^T\bm{X}, \text{ mit }\bm{b} = \begin{bmatrix} \frac{1}{n}\\\vdots\\\frac{1}{n} \end{bmatrix}$$
+        \end{itemize}
+    \item Matrix Inverse:
+        \begin{itemize}
+            \item Definition: $\bm{WW}^{-1} = \bm{I},\qquad\bm{W}^{-1}\bm{W} = \bm{I}$
+            \item Identitätsmatrix $\bm{I} = \begin{bmatrix} 1 & \cdots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \cdots & 1 \end{bmatrix}$
+            \item \textbf{nur} quadratische Matrizen können invertiert werden
+        \end{itemize}
+\end{itemize}
+
+\section{Matrix-Calculus}%
+\label{sec:Matrix-Calculus}
+\begin{itemize}
+    \item Die Ableitung einer skalaren Funktion, die auf einem Vektor ausgewertet wird, ergibt einen Gradientenvektor:
+        $$\nabla_{\bm{x}}f = \frac{\partial f(\bm{x})}{\partial \bm{x}} = \begin{bmatrix} \frac{\partial f(\bm{x})}{\partial x_1} \\\vdots\\ \frac{\partial f(\bm{x})}{\partial x_d}\end{bmatrix}$$
+    \item Die Ableitung einer Vektor-Funktion, die auf einem Vektor ausgewertet wird, ergibt eine Jacobi-Matrix:
+        $$\nabla_{\bm{x}}\bm{f} = \frac{\partial \bm{f}(\bm{x})}{\partial \bm{x}} 
+        = \begin{bmatrix} \frac{\partial f_1(\bm{x})}{\partial x_1} & \cdots & \frac{\partial f_k(\bm{x})}{\partial x_1} \\ \vdots & \ddots & \vdots \\ \frac{\partial f_1(\bm{x})}{\partial x_d} & \cdots & \frac{\partial f_k(\bm{x})}{\partial x_d}\end{bmatrix}$$
+    \item die Ableitung einer skalaren Funktion, die auf einer Matrix ausgewertet wird ergibt eine Matrix:
+        $$\nabla_{\bm{W}}f = \frac{\partial f(\bm{W})}{\partial \bm{W}} 
+        = \begin{bmatrix} \frac{\partial f(\bm{W})}{\partial W_{11}} & \cdots & \frac{\partial f(\bm{W})}{\partial W_{1d}} \\ \vdots & \ddots & \vdots \\ \frac{\partial f(\bm{W})}{\partial W_{k1}} & \cdots & \frac{\partial f(\bm{W})}{\partial W_{kd}}\end{bmatrix}$$
+    \item die Ableitung einer Vektor-Funktion, die auf einer Matrix ausgewertet wird ergibt einen 3D-Tensor (sehr kompliziert, wird (fast) nie benötigt)
+    \item Grundlegende Formeln:\\
+        \begin{tabular}{l c|c|c}
+            & \bfseries Skalar & \bfseries Vektor & \bfseries Matrix\\
+            &&&\\
+            \textbf{Linear} & $\frac{\partial a x}{\partial x} = a$ & $\nabla_{\bm{x}}\bm{Ax} = \bm{A}^T$ & \begin{tabular}{@{}c@{}} $\nabla_{\bm{X}}\bm{a}^T\bm{Xb} = \bm{ab}^T $ \\ $\nabla_{\bm{X}}\operatorname{tr}(\bm{AXB}) = \bm{A}^T\bm{B}^T$ \end{tabular}\\
+            &&&\\
+            \textbf{Quadratisch} & $\frac{\partial x^2}{\partial x} = 2x$ &  $\begin{aligned} \nabla_{\bm{x}}\bm{x}^T\bm{x} &= 2\bm{x} \\ \nabla_{\bm{x}}\bm{x}^T\bm{Ax} &= (\bm{A}^T + \bm{A})\bm{x}\\&=2\bm{Ax}\text{ wenn $\bm{A}$ symmetrisch ist }\end{aligned}$ & 
+        \end{tabular}
+\end{itemize}
+
+
diff --git a/chapters/Mathematische_Grundlagen/Probability_Theory.tex b/chapters/Mathematische_Grundlagen/Probability_Theory.tex
new file mode 100644
index 0000000..1c9046f
--- /dev/null
+++ b/chapters/Mathematische_Grundlagen/Probability_Theory.tex
@@ -0,0 +1,61 @@
+\chapter{Probability Theory}%
+\label{cha:Probability Theory}
+Eine Funktion \nomsym{probability_mass_function},
+die die Wahrscheinlichkeit angibt,
+dass $X$ den Wert $x$ annimmt,
+wird als \noms{probability_mass_function} bezeichnet.
+Eine gültige \noms{probability_mass_function} muss folgende Eigenschaften erfüllen:
+\begin{itemize}
+    \item weist jedem $x\in X$ einen Wert zu
+    \item nicht-negativ
+    \item die Summe aller Wahrscheinlichkeiten ist 1
+\end{itemize}
+
+Zwei Wahrscheinlichkeitsdichtefunktionen können dabei auf verschiedene Arten miteinander zusammenhängen:
+\begin{itemize}
+    \item \textbf{Joint Distribution} $p(x,y)$: Die Wahrscheinlichkeit, dass $X=x$ und $Y=y$
+    \item \textbf{Conditional Distribution} $p(x|y)$: Die Wahrscheinlichkeit für $X=x$, wenn $Y=y$ gegeben ist
+\end{itemize}
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=.6\linewidth]{images/conditional_and_joint_distribution.png}
+    \caption{Conditional and Joint Distribution}
+    \label{fig:conditional_and_joint_distribution}
+\end{figure}
+
+\section{Rules of Probability}%
+\label{sec:Rules of Probability}
+\paragraph{Summenregel}%
+\label{par:Summenregel}
+\begin{align} \label{eq:sum_rule}
+    p(x) &= \sum_y p(x,y)\\
+    p(x_1) &= \sum_{x_2}\sum_{x_3}\cdots\sum_{x_D} p(x_1,\dots,x_D)
+\end{align}
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.6\textwidth]{images/sum_rule.png}
+    \caption{Summenregel}
+    \label{fig:sum_rule}
+\end{figure}
+
+\paragraph{Ketten"~\slash\,Produktregel}%
+\label{par:Ketten-/Produktregel}
+\begin{align}\label{eq:chain_rule}
+    p(x,y) &= p(x|y)p(y) \\
+    p(x_1,\dots,x_D) &= p(x_1)p(x_2|x_1)\dots p(x_D|x_1,\dots, x_{D-1})
+\end{align}
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.7\textwidth]{images/chain_rule.png}
+    \caption{Ketten"~\slash\,Produktregel}
+    \label{fig:chain_rule}
+\end{figure}
+
+\subsection{Bayes Rule}%
+\label{sub:Bayes Rule}
+Die Regel von Bayes ist eine der wichtigsten Regeln der Wahrscheinlichkeitstheorie und essentiell im Bereich des Maschinellen Lernens.
+\begin{equation} \label{eq:Bayes Rule}
+    p(x|y) = \dfrac{p(y|x)p(x)}{p(y)} = \dfrac{p(y|x)p(x)}{\sum_{x'}p(y|x')p(x')}
+\end{equation}
+
+Weiter auf Seite 114
diff --git a/images/chain_rule.png b/images/chain_rule.png
new file mode 100644
index 0000000..0645c28
Binary files /dev/null and b/images/chain_rule.png differ
diff --git a/images/conditional_and_joint_distribution.png b/images/conditional_and_joint_distribution.png
new file mode 100644
index 0000000..8c5d973
Binary files /dev/null and b/images/conditional_and_joint_distribution.png differ
diff --git a/images/sum_rule.png b/images/sum_rule.png
new file mode 100644
index 0000000..27402ce
Binary files /dev/null and b/images/sum_rule.png differ