diff --git a/alternative_1_1.ipynb b/alternative_1_1.ipynb new file mode 100644 index 0000000..ea88634 --- /dev/null +++ b/alternative_1_1.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\\begin{align}\n", + "\\frac{\\partial}{\\partial w_k} &= \\frac{\\partial}{\\partial w_k}\\left(\\sum_{k=1}^Kh_{c_i,k} w_k^T \\phi(x_i) - \\log \\left(\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}\\right)\\right)\\\\\n", + "\\text{the $k$ in $\\partial w_k$ is not the same as the $k$ in the sum $\\sum_{k=1}^K$. To avoid confusion we will rename the first $w_k$ to $w_l$:}\\\\\n", + "&= \\frac{\\partial}{\\partial w_l}\\left(\\sum_{k=1}^Kh_{c_i,k} w_k^T \\phi(x_i) - \\log \\left(\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}\\right)\\right)\\\\\n", + "\\text{because } \\frac{\\partial}{\\partial w_l}\\left(w_k^T \\phi(x_i)\\right)=\\begin{cases}0 & \\forall k\\neq l \\\\ \\phi(x_i)^T & \\text{for }k=l\\end{cases} \\text{ only the $k=l$ term of the sum $\\sum_{k=1}^K$ survives, keeping its factor $h_{c_i,l}$ (which is $1$ if $l=c_i$ and $0$ otherwise):}\\\\\n", + "&= h_{c_i,l}\\frac{\\partial}{\\partial w_l}\\left(w_l^T \\phi(x_i)\\right) - \\frac{\\partial}{\\partial w_l}\\left( \\log \\left(\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}\\right)\\right)\\\\\n", + "\\text{from $\\frac{\\partial x^T a}{\\partial x}=a^T$ and $\\frac{\\partial \\log(f(x))}{\\partial x}=\\frac{f'(x)}{f(x)}$ follows:}\\\\\n", + "&= h_{c_i,l}\\phi(x_i)^T - \\frac{\\frac{\\partial}{\\partial w_l}\\left(\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}\\right)}{\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}}\\\\\n", + "\\frac{\\partial}{\\partial w_l} e^{w_j^T\\phi(x_i)} = \\begin{cases}0\\cdot e^{w_j^T\\phi(x_i)}=0 & \\forall j\\neq l\\\\ \\phi(x_i)^T e^{w_l^T\\phi(x_i)} &\\text{for } j=l\\end{cases}\\text{ leads to:}\\\\\n", + "&= h_{c_i,l}\\phi(x_i)^T - \\frac{ \\phi(x_i)^T e^{w_l^T \\phi(x_i)}}{\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}}\\\\\n", + "&= \\phi(x_i)^T \\left(h_{c_i,l} - \\frac{e^{w_l^T \\phi(x_i)}}{\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}}\\right)\\\\\n", + "\\end{align}\n", + "\n", + "If we do the same thing for every $w_l$ we get\n", + "\n", + "$$\n", + "\\dfrac{\\partial \\mathcal 
L_{\\mathrm{cat-NLL}}}{\\partial \\boldsymbol{w}}\n", + "= \\begin{pmatrix}\n", + "\\phi(x_i)^T \\left(h_{c_i,1} - \\frac{e^{w_1^T \\phi(x_i)}}{\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}}\\right)\\\\\n", + "\\vdots \\\\\n", + "\\phi(x_i)^T \\left(h_{c_i,K} - \\frac{e^{w_K^T \\phi(x_i)}}{\\sum_{j=1}^K e^{w_j^T \\phi(x_i)}}\\right)\\\\\n", + "\\end{pmatrix}\n", + "$$" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "a170f962d07ed8515a1e0cfe90051c8c49c07a10f515ca6de22ae850428164e9" + }, + "kernelspec": { + "display_name": "Python 3.8.5 64-bit ('base': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}