diff --git a/Glossary.tex b/Glossary.tex
index b39acd2..0f494ee 100644
--- a/Glossary.tex
+++ b/Glossary.tex
@@ -80,6 +80,9 @@
 \newacronym{RBF}{RBF}{Radial Basis Function Kernel}
 \newacronym{SVM}{SVM}{Support Vector Machine}
 \newacronym{ARD}{ARD}{Automatic Relevance Determination}
+\newacronym{MLP}{MLP}{Multi-Layer Perceptron}
+\newacronym{ReLU}{ReLU}{Rectified Linear Unit}
+\newacronym{ELU}{ELU}{Exponential Linear Unit}
 
 %--------------------
 %nomenclature
@@ -134,6 +137,7 @@
 \newnom{gaussian_distribution}{Gaußsche Normalverteilung}{\mathcal{N}}{}
 \newnom{gaussian_process}{Gaußscher Prozess}{\mathcal{GP}}{}
 \newnom{hyper_parameters}{Hyper-Parameter}{\bm{\beta}}{}
+\newnom{activation_function}{Aktivierungsfunktion}{\phi}{}
 \shorthandoff{"}
 
 \makeglossaries
diff --git a/ML_Zusammenfassung.tex b/ML_Zusammenfassung.tex
index 5c26169..87e70af 100644
--- a/ML_Zusammenfassung.tex
+++ b/ML_Zusammenfassung.tex
@@ -51,6 +51,7 @@
 
     \part{Neural Networks}
     \label{part:Neural Networks}
+    \input{chapters/Neural_Networks/Basics.tex}
     \input{chapters/Neural_Networks/Neural_Networks_and_Backpropagation.tex}
     \input{chapters/Neural_Networks/CNNs_and_LSTMs.tex}
 
diff --git a/Packages.tex b/Packages.tex
index a3d3b6c..095db22 100644
--- a/Packages.tex
+++ b/Packages.tex
@@ -22,6 +22,8 @@
 \usepackage{svg}
 %subfigures
 \usepackage{subcaption}
+%borders around images
+\usepackage[export]{adjustbox}
 
 %--------------------
 %german quotation
@@ -117,6 +119,7 @@ rightsub = \grq%
 \DeclareMathOperator*{\lik}{lik}
 \DeclareMathOperator*{\loss}{loss}
 \DeclareMathOperator*{\loglik}{loglik}
+\DeclareMathOperator*{\softmax}{softmax}
 %special symbols
 \usepackage{fontawesome}
 \usepackage{amssymb}
diff --git a/chapters/Neural_Networks/Basics.tex b/chapters/Neural_Networks/Basics.tex
new file mode 100644
index 0000000..0b54e5f
--- /dev/null
+++ b/chapters/Neural_Networks/Basics.tex
@@ -0,0 +1,169 @@
+\chapter{Basics}%
+\label{cha:Neural Networks:Basics}
+Neuronale Netze sind vom menschlichen Gehirn inspiriert.
+Dieses besteht aus einer Vielzahl ($\approx 10^{11}$) Neuronen,
+die untereinander stark verknüpft sind ($\approx 10^4$ Verknüpfungen pro Neuron).
+Das Signal der eingehenden Neuronen muss stark genug sein,
+damit ein angesteuertes Neuron ein Signal ausgibt.
+
+Diesem Prinzip folgend bestehen auch Neuronale Netze aus einfachen Bestandteilen (den Neuronen\slash\,Perzeptronen).
+Ein Neuron hat dabei mehrere Eingänge und einen Ausgang.
+Die Berechnung des Ausgabewertes ergibt sich durch
+
+\begin{equation} \label{eq:perceptron_output_function}
+    y = \nomeq{activation_function}(\bm{w}^T\bm x + b)
+\end{equation}
+\begin{wrapfigure}{r}{.5\textwidth}
+    \vspace*{-5mm}
+    \centering
+    \includegraphics[width=0.8\linewidth]{perceptron.png}
+    \caption{Aufbau eines Perzeptrons}
+    \label{fig:perceptron}
+    \vspace*{-20mm}
+\end{wrapfigure}
+\begin{itemize}
+    \item $y$:\tabto{15mm}Output
+    \item \nomsym{activation_function}:\tabto{15mm}\noms{activation_function}
+    \item $\bm w$:\tabto{15mm}Gewichte
+    \item $\bm x$:\tabto{15mm}Inputs
+    \item $b$:\tabto{15mm}Bias
+\end{itemize}
+Ein Beispiel für eine solche Output-Funktion ist schon aus \cref{cha:Linear Regression} bekannt.
+
+\section{Feedforward Neural Networks}%
+\label{sec:Feedforward Neural Networks}
+\begin{wrapfigure}{r}{.4\textwidth}
+    \vspace*{-10mm}
+    \centering
+    \includegraphics[width=\linewidth]{feedforward_neural_network.png}
+    \caption{Feedforward Neural Network}
+    \label{fig:feedforward_neural_network}
+\end{wrapfigure}
+Ein Feedforward Neural Network ist ein Neuronales Netzwerk,
+bei dem die einzelnen Neuronen in Schichten aufgebaut sind,
+wobei jedes Neuron nur Inputs aus der darunterliegenden Schicht erhält und seine Ausgabe an die darüberliegende Schicht weitergibt.
+Feedforward Neural Networks werden daher auch als \glspl{MLP} bezeichnet.
+Das Netzwerk hat daher die Form eines gerichteten azyklischen Graphen (Directed Acyclic Graph).
+
+Hierbei werden die Eingabewerte an die Eingabeschicht (Input Layer) übergeben und die Ausgabe des Neuronalen Netzwerkes erfolgt an der Output Layer.
+Alle Schichten zwischen Input und Output Layer werden als verdeckte Schichten (Hidden Layer) bezeichnet.
+
+Die Gewichte zwischen den einzelnen Layern bilden eine Matrix $\bm W$.
+Der Ausgabevektor $\bm y$,
+bestehend aus den Ausgabewerten der in der Schicht enthaltenen Neuronen,
+ergibt sich durch:
+
+\begin{equation} \label{eq:output_function_feedforward_layer}
+    \bm y = \nomeq{activation_function}(\bm W\bm x + \bm b)
+\end{equation}
+\begin{wrapfigure}{r}{.3\textwidth}
+    \vspace*{-8mm}
+    \centering
+    \includegraphics[width=0.8\linewidth]{feedforward_neural_network_composition.png}
+    \caption{Feedforward Neural Network mit Funktionen}
+    \label{fig:feedforward_neural_network_composition}
+    \vspace*{-20mm}
+\end{wrapfigure}
+Da jede Layer einen Vektor als Eingabe und Ausgabe hat,
+kann man sie auch als vektorwertige Funktion schreiben
+\begin{align} \label{eq:feedforward_neural_network_vector_valued_function}
+    \bm h^{(1)} &= f^{(1)}(\bm x)\\
+    \bm h^{(2)} &= f^{(2)}(\bm h^{(1)})\\
+    \vdots\\
+    \bm y &= f^{(L)}(\bm h^{(L-1)})
+\end{align}
+Daher kann man die Ausgabe eines Feedforward Neural Networks auch als Komposition der Funktionen der einzelnen Schichten betrachten.
+\begin{equation} \label{eq:feedforward_neural_network_composition}
+    \bm y = f^{(L)}\circ f^{(L-1)}\circ\dots\circ f^{(1)}(\bm x)
+\end{equation}
+Hieraus ergibt sich allerdings auch,
+dass jedes mehrschichtige Feedforward Neural Network auch durch ein Netzwerk mit nur einer Schicht ersetzt werden kann (Universal Representation).
+Hierzu wäre allerdings eine exponentielle Anzahl an Neuronen nötig,
+weshalb in der Praxis mehrschichtige Netzwerke verwendet werden (Compact Representation).
+
+\subsection{Example: XOR}%
+\label{sub:Example: XOR}
+({\color{red}siehe Vorlesung 08 Folie 25 und 26})
+
+\section{\nomf{activation_function}}%
+\label{sec:Activation Function}
+Die \nomf{activation_function} gibt vor,
+ab welchem Schwellwert das Produkt aus Eingangswerten und Gewichten zu einem relevanten Ausgabewert führt.
+In den meisten Fällen wird die \glsxtrshort{ReLU} \noms{activation_function} verwendet,
+wobei es sich auch lohnt, die Leaky \glsxtrshort{ReLU} oder \glsxtrshort{ELU} auszuprobieren.
+Die Sigmoid Funktion (\cref{ssub:Logistic sigmoid function}) sollte ausschließlich als \noms{activation_function} in Klassifikationsproblemen verwendet werden.\\
+\includegraphics[scale=.7]{sigmoid_activation_function.png}\\
+\hrule
+\includegraphics[scale=.7]{tanh_activation_function.png}\\
+\hrule
+\includegraphics[scale=.7]{ReLU_activation_function.png}\\
+\hrule
+\includegraphics[scale=.7]{Leaky_ReLU_activation_function.png}\\
+\hrule
+\includegraphics[scale=.7]{exponential_linear_units_activation_function.png}\\
+
+
+\section{Optimization}%
+\label{sec:Optimization}
+Die Optimierung der Parameter $\bm \theta$ erfolgt nach dem allgemeinen ML-Rezept:
+\begin{equation} \label{eq:neural_network_parameter_optimization}
+    \bm\theta^* = \argmin_{\bm\theta} \sum_{i=1}^N l(\bm x_i,\bm\theta) + \nomeq{regularization_factor}\text{ penalty}(\bm\theta)
+\end{equation}
+Welche Loss Function $l$ verwendet werden sollte, hängt dabei vom Anwendungsgebiet ab:
+\begin{table}[H]
+    \centering
+    \caption{Loss Functions für verschiedene Anwendungsaufgaben}
+    \label{tab:loss_functions_for_different_tasks}
+    \begin{tabularx}{\textwidth}{|c|Y|Y|}
+        \hline
+        & \bfseries Deterministic & \bfseries Probabilistic \\
+        \hline
+        \hline
+        \multicolumn{3}{|c|}{\bfseries Regression} \\
+        \hline
+            \bfseries Output Layer & 
+                \begin{tabular}{@{}c@{}} linear \\ $\bm f = \bm W^{(L)}\bm h^{(L-1)}+\bm b^{(L)}$ \end{tabular} &
+                \begin{tabular}{@{}c@{}} linear Gaussian \\ $p(\bm y|\bm x) = \nomeq{gaussian_distribution}(\bm y|\bm W^{(L)}\bm h^{(L-1)} + \bm b^{(L)}, \nomeq{covariance})$\end{tabular}\\
+        \hline
+            \bfseries Loss & 
+                \begin{tabular}{@{}c@{}} squared error \\ $l_i(\bm x_i,\bm\theta) = \frac{1}{2}(\bm f(\bm x_i)-\bm y_i)^2$\end{tabular} &
+                \begin{tabular}{@{}c@{}} negative log-likelihood \\ $l_i(\bm x_i,\bm\theta) = -\log\nomeq{gaussian_distribution}(\bm y_i|\bm\mu(\bm x_i),\nomeq{covariance})$ \end{tabular}\\
+        \hline
+        \hline
+        \multicolumn{3}{|c|}{\bfseries Binary Classification} \\
+        \hline
+            \bfseries Output Layer & 
+                \begin{tabular}{@{}c@{}} linear \\ $f = \bm W^{(L)}\bm h^{(L-1)}+ b^{(L)}$ \end{tabular} &
+                \begin{tabular}{@{}c@{}} sigmoid \\ $f = \sigma(\bm W^{(L)}\bm h^{(L-1)} + b^{(L)})$\end{tabular}\\
+        \hline
+            \bfseries Loss & 
+                \begin{tabular}{@{}c@{}} \nameref{sub:Hinge Loss} \\ $l(\bm x_i, \bm\theta) = \max(0,1-y_i f(\bm x_i))$\end{tabular} &
+                \begin{tabular}{@{}c@{}} negative log-likelihood \\ $l_i(\bm x_i,\bm\theta) = -c_i \log f(\bm x_i) - (1-c_i)\log(1-f(\bm x_i))$\end{tabular}\\
+        \hline
+        \hline
+        \multicolumn{3}{|c|}{\bfseries Multi-class Classification} \\
+        \hline
+            \bfseries Output Layer & 
+                \begin{tabular}{@{}c@{}} linear \\ $\bm f = \bm W^{(L)}\bm h^{(L-1)} + \bm b^{(L)}$\end{tabular} &
+                \begin{tabular}{@{}c@{}} softmax \\ $\bm f = \softmax(\bm W^{(L)}\bm h^{(L-1)} + \bm b^{(L)})$\end{tabular}\\
+        \hline
+            \bfseries Loss & 
+                \begin{tabular}{@{}c@{}} Multi-class SVM loss \\ {\color{red} in diesem Fach nicht behandelt} \end{tabular} &
+                \begin{tabular}{@{}c@{}} negative log-likelihood \\ $l_i(\bm x_i,\bm\theta) = -\sum_{k=1}^K \bm h_{c_i,k} \log y_k(\bm x_i)$\end{tabular}\\
+        \hline
+    \end{tabularx}
+\end{table}
+
+\section{Feature Learning}%
+\label{sec:Feature Learning}
+Die letzte Schicht eines Neural Networks macht im Prinzip nur eine \nameref{cha:Linear Regression},
+welche durch die vorhergegangenen Schichten vorbereitet wird.
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.7\textwidth]{feature_learning.png}
+    \caption{Schichten vor dem letzten Layer bereiten die Daten für eine \nameref{cha:Linear Regression} vor}
+    \label{fig:feature_learning}
+\end{figure}
+\subsection{Beispiel}%
+\label{sub:Feature Learning:Beispiel}
+{\color{red} siehe Vorlesung 08 Folie 35}
diff --git a/images/Leaky_ReLU_activation_function.png b/images/Leaky_ReLU_activation_function.png
new file mode 100644
index 0000000..a4abb51
Binary files /dev/null and b/images/Leaky_ReLU_activation_function.png differ
diff --git a/images/ReLU_activation_function.png b/images/ReLU_activation_function.png
new file mode 100644
index 0000000..6e144cb
Binary files /dev/null and b/images/ReLU_activation_function.png differ
diff --git a/images/activation_functions.png b/images/activation_functions.png
new file mode 100644
index 0000000..d3cd071
Binary files /dev/null and b/images/activation_functions.png differ
diff --git a/images/exponential_linear_units_activation_function.png b/images/exponential_linear_units_activation_function.png
new file mode 100644
index 0000000..cfcb41c
Binary files /dev/null and b/images/exponential_linear_units_activation_function.png differ
diff --git a/images/feature_learning.png b/images/feature_learning.png
new file mode 100644
index 0000000..f88e483
Binary files /dev/null and b/images/feature_learning.png differ
diff --git a/images/feedforward_neural_network.png b/images/feedforward_neural_network.png
new file mode 100644
index 0000000..fa70c1a
Binary files /dev/null and b/images/feedforward_neural_network.png differ
diff --git a/images/feedforward_neural_network_composition.png b/images/feedforward_neural_network_composition.png
new file mode 100644
index 0000000..76661d2
Binary files /dev/null and b/images/feedforward_neural_network_composition.png differ
diff --git a/images/perceptron.png b/images/perceptron.png
new file mode 100644
index 0000000..8889e66
Binary files /dev/null and b/images/perceptron.png differ
diff --git a/images/sigmoid_activation_function.png b/images/sigmoid_activation_function.png
new file mode 100644
index 0000000..1ea4423
Binary files /dev/null and b/images/sigmoid_activation_function.png differ
diff --git a/images/tanh_activation_function.png b/images/tanh_activation_function.png
new file mode 100644
index 0000000..96c7b6f
Binary files /dev/null and b/images/tanh_activation_function.png differ