From ffa483afd3d0b397179544f9ee1d1391b51ff8f3 Mon Sep 17 00:00:00 2001
From: Pascal Schindler
Date: Thu, 25 Nov 2021 12:10:55 +0100
Subject: [PATCH] Solution 1.1 finished

---
 exercise.ipynb | 79 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/exercise.ipynb b/exercise.ipynb
index b78e7c6..01a5132 100644
--- a/exercise.ipynb
+++ b/exercise.ipynb
@@ -86,7 +86,7 @@
    },
    "outputs": [],
    "source": [
-    "data = np.load(\"iris_data.npz\")\n",
+    "data = np.load(\"/Users/pascalschindler/Desktop/ML_Übungsblätter/Exercise_2/iris_data.npz\")\n",
     "train_samples = data[\"train_features\"]\n",
     "train_labels = data[\"train_labels\"]\n",
     "test_samples = data[\"test_features\"]\n",
@@ -192,13 +192,16 @@
     "\n",
     "\\begin{align} \n",
     "\\mathcal L_{\\mathrm{cat-NLL}} \n",
-    "=& - \\sum_{i=1}^N \\log p(c_i | \\boldsymbol x_i) \\\\ \n",
-    "=& - \\sum_{i=1}^N \\sum_{k=1}^K h_{i, k} \\log(p_{i,k}) \\\\ \n",
-    "=& - \\sum_{i=1}^N \\sum_{k=1}^K h_{i, k} \\log(\\textrm{softmax}(\\boldsymbol{w}_k^T \\boldsymbol \\phi(\\boldsymbol{x}_i))_k)\\\\ \n",
+    "=& - \\sum_{i=1}^N \\log p(c_i | \\boldsymbol x_i) \\\\\n",
+    "=& - \\sum_{i=1}^N \\sum_{k=1}^K h_{i, k} \\log(p_{i,k}) \\\\\n",
+    "=& - \\sum_{i=1}^N \\sum_{k=1}^K h_{i, k} \\log(\\textrm{softmax}(\\boldsymbol{w}_k^T \\boldsymbol \\phi(\\boldsymbol{x}_i))_k)\\\\\n",
     "=& - \\sum_{i=1}^N \\left(\\sum_{k=1}^K h_{i,k}\\boldsymbol{w}^T_k \\boldsymbol \\phi(\\boldsymbol{x}_i) - \\log \\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))\\right).\n",
     "\\end{align}\n",
     "\n",
     "In order to use gradient based optimization for this, we of course also need to derive the gradient.\n",
+    "Note that each summand of the outer sum over $i$ is the loss contribution of a single sample; the total loss is obtained by summing these per-sample losses over all $N$ training samples.\n",
+    "\n",
+    "\n",
     "\n",
     "### 1.1) Derivation (4 Points)\n",
     "Derive the gradient $\\dfrac{\\partial \\mathcal L_{\\mathrm{cat-NLL}}}{\\partial \\boldsymbol{w}}$ of the loss function w.r.t. the full weight vector $\\boldsymbol w \\equiv \\begin{pmatrix} \\boldsymbol w_1^T & \\dots & \\boldsymbol w_K^T \\end{pmatrix}^T$, which is obtained by stacking the class-specific weight vectors $\\boldsymbol w_k$.\n",
@@ -206,9 +209,73 @@
     "**Hint 1:** Follow the steps in the derivation of the gradient of the loss for the binary classification in the lecture.\n",
     "\n",
     "**Hint 2:** Derive the gradient not for the whole vector $\\boldsymbol w$ but only for $\\boldsymbol w_k$ i.e., $\\dfrac{\\partial \\mathcal L_{\\mathrm{cat-NLL}}}{\\partial \\boldsymbol{w}_k}$. The gradients for the individual\n",
-    "$\\boldsymbol w_k$ can then be stacked to obtain the full gradient.\n"
+    "$\\boldsymbol w_k$ can then be stacked to obtain the full gradient."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "\\begin{align}\n",
+    "\\mathcal L_{\\mathrm{cat-NLL}}\n",
+    "=& - \\sum_{i=1}^N \\left(\\sum_{k=1}^K h_{i,k}\\boldsymbol{w}^T_k \\boldsymbol \\phi(\\boldsymbol{x}_i) - \\log \\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))\\right) \\\\\n",
+    "=& - \\sum_{i=1}^N \\left(\\sum_{k=1}^K h_{i,k}\\boldsymbol{w}^T_k \\boldsymbol \\phi(\\boldsymbol{x}_i) \\right) + \\sum_{i=1}^N \\log \\left(\\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))\\right)\n",
+    "\\end{align}\n"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We take the gradient $\\nabla_{\\boldsymbol{w}_k}\\mathcal L_{\\mathrm{cat-NLL}}$ with respect to one particular weight vector $\\boldsymbol{w}_k$. In the first term only the summand with index $k$ depends on $\\boldsymbol{w}_k$, so the sum $\\sum_{k=1}^K$ collapses to that single summand. (Writing the log-normalizer as $\\sum_{k=1}^K h_{i,k} \\log \\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))$ changes nothing, since $\\sum_{k=1}^K h_{i,k} = 1$.)\n"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "\\begin{align}\n",
+    "\\dfrac{\\partial \\mathcal L_{\\mathrm{cat-NLL}}}{\\partial \\boldsymbol{w}_k}\n",
+    "=& \\dfrac{\\partial}{\\partial \\boldsymbol{w}_k} \\left(- \\sum_{i=1}^N h_{i,k}\\boldsymbol{w}^T_k \\boldsymbol \\phi(\\boldsymbol{x}_i) + \\sum_{i=1}^N \\log \\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))\\right) \\\\\n",
+    "=& - \\sum_{i=1}^N h_{i,k}\\boldsymbol \\phi(\\boldsymbol{x}_i) + \\sum_{i=1}^N \\frac{\\exp(\\boldsymbol{w}_k^T \\boldsymbol \\phi(\\boldsymbol{x}_i))}{\\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))} \\boldsymbol \\phi(\\boldsymbol{x}_i) \\\\\n",
+    "=& \\sum_{i=1}^N \\left( \\frac{\\exp(\\boldsymbol{w}_k^T \\boldsymbol \\phi(\\boldsymbol{x}_i))}{\\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))} - h_{i,k} \\right) \\boldsymbol \\phi(\\boldsymbol{x}_i)\n",
+    "\\end{align}"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "This is the gradient for one specific weight vector, i.e. for one class. Computing it for all $K$ classes and stacking the results yields the full gradient:\n"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "\\begin{align}\n",
+    "\\dfrac{\\partial \\mathcal L_{\\mathrm{cat-NLL}}}{\\partial \\boldsymbol{w}}\n",
+    "=& \\begin{pmatrix}\n",
+    "\\sum_{i=1}^N \\left( \\frac{\\exp(\\boldsymbol{w}_1^T \\boldsymbol \\phi(\\boldsymbol{x}_i))}{\\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))} - h_{i,1} \\right) \\boldsymbol \\phi(\\boldsymbol{x}_i) \\\\\n",
+    "\\vdots \\\\\n",
+    "\\sum_{i=1}^N \\left( \\frac{\\exp(\\boldsymbol{w}_K^T \\boldsymbol \\phi(\\boldsymbol{x}_i))}{\\sum_{j=1}^K \\exp(\\boldsymbol{w}_j^T \\boldsymbol \\phi(\\boldsymbol{x}_i))} - h_{i,K} \\right) \\boldsymbol \\phi(\\boldsymbol{x}_i)\n",
+    "\\end{pmatrix}\n",
+    "\\end{align}"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -1030,4 +1097,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}
\ No newline at end of file
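
As a quick sanity check of the derived gradient, here is a minimal NumPy sketch (illustrative only, not part of the patch; the names `cat_nll_and_grad`, `weights`, `features`, `labels_onehot` and the assumed array shapes are made up for this example):

```python
import numpy as np

def cat_nll_and_grad(weights, features, labels_onehot):
    """Categorical NLL and its gradient for softmax regression.

    weights       : (K, D) stacked class weight vectors w_k
    features      : (N, D) feature vectors phi(x_i)
    labels_onehot : (N, K) one-hot labels h_{i,k}
    """
    logits = features @ weights.T                      # (N, K), entry [i, k] = w_k^T phi(x_i)
    logits -= logits.max(axis=1, keepdims=True)        # stabilise the log-sum-exp
    log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    nll = -np.sum(labels_onehot * log_probs)           # - sum_i sum_k h_ik * log p_ik

    probs = np.exp(log_probs)                          # softmax probabilities p_ik
    grad = (probs - labels_onehot).T @ features        # (K, D), row k = sum_i (p_ik - h_ik) phi(x_i)
    return nll, grad
```

Row `k` of `grad` is the per-class gradient derived above; stacking (flattening) the rows gives the gradient w.r.t. the full weight vector, and comparing `grad` against finite differences of `nll` is an easy way to verify the derivation.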