Friday, December 8, 2017

Forward and backward propagations for gradient descent

$I$ :  number of examples
$i$ :  example index
$R$:  number of features in $0^\text{th}$ layer
$r$:   index for $0^\text{th}$ layer
$S$:   number of features in $1^\text{st}$ layer
$s$ :   index for $1^\text{st}$ layer
$T$:   number of features in $2^\text{nd}$ layer
$t$ :   index in $2^\text{nd}$ layer
Summary of forward propagation:
\begin{array}{cl|lllll}
\text{Layer} & \text{Name} &  \text{Variable} & \text{Size}& \text{Element} & \text{Forward Prop.} & \text{Forward Element} \\
\hline
0 & Feature &  A^{[0]} & R\times I & A^{r[0]}_i & & \\
\hline
1 & Feature &  A^{[1]} & S\times I & A^{s[1]}_i &  A^{[1]} = g(Z^{[1]})  &
        A^{s[1]}_i = g(Z^{s[1]}_i) \\
   & Z           &  Z^{[1]} & S\times I & Z^{s[1]}_i & Z^{[1]} = W^{[1]}A^{[0]}+b^{[1]} &
        Z^{s[1]}_i = W^{s[1]}_r A^{r[0]}_i + b^{s[1]} \\
   & Weight  & W^{[1]} &S\times R &W^{s[1]}_r& & \\
   & Bias      &   b^{[1]} & S\times 1& b^{s[1]} & &\\
\hline
2 & Feature &  A^{[2]} & T\times I & A^{t[2]}_i & A^{[2]} = \sigma(Z^{[2]}) &
       A^{t[2]}_i = \sigma(Z^{t[2]}_i) \\
   & Z           &  Z^{[2]} & T\times I & Z^{t[2]}_i & Z^{[2]} = W^{[2]}A^{[1]}+b^{[2]} &
       Z^{t[2]}_i = W^{t[2]}_s A^{s[1]}_i + b^{t[2]}  \\
   & Weight  & W^{[2]} &T\times S &W^{t[2]}_s& & \\
   & Bias      &   b^{[2]} & T\times 1&  b^{t[2]} \\
\end{array}
Summary of backward propagation:
\begin{array}{cl|lllll}
\text{Layer} & \text{Name} &  \text{Variable} & \text{Size}& \text{Element} & \text{Backward Prop.} & \text{Backward Element} \\
\hline
1  & Feature &  dA^{[1]} & S\times I & dA^{s[1]}_i & dA^{[1]} = W^{[2]T} dZ^{[2]} & dA_i^{s[1]} = dZ_i^{t[2]} W_t^{s[2]} \\
    & Z           &  dZ^{[1]} & S\times I & dZ^{s[1]}_i & dZ^{[1]} = W^{[2]T} dZ^{[2]}
        \cdot g'(Z^{[1]}) &  dZ_i^{s[1]} =  W_t^{s[2]} dZ_i^{t[2]}  g'(Z_i^{s[1]}) \\
    & Weight  & dW^{[1]} &S\times R &dW^{s[1]}_r& dW^{[1]} = dZ^{[1]} A^{[0]T} &
        dW_r^{s[1]} =  dZ_i^{s[1]} A_r^{i[0]}\\
    & Bias      &   db^{[1]} & S\times 1& db^{s[1]} & db^{[1]} = \sum_i dZ^{[1]}_i &
        db^{s[1]} = \sum_i dZ_i^{s[1]} \\
\hline
 2  & Feature &  dA^{[2]} & T\times I & dA^{t[2]}_i &   \\
    & Z           &  dZ^{[2]} & T\times I & dZ^{t[2]}_i & dZ^{[2]} = \partial J / \partial Z^{[2]} &
         dZ_i^{t[2]} = \partial J / \partial Z_t^{i[2]}  \\
     & Weight  & dW^{[2]} &T\times S &dW^{t[2]}_s& dW^{[2]} = dZ^{[2]} A^{[1]T} &
         dW_s^{t[2]} = dZ_i^{t[2]} A_s^{i[1]} \\
     & Bias      &   db^{[2]} & T\times 1&  db^{t[2]}    & db^{[2]} = \sum_i dZ^{[2]}_i  &
         db^{t[2]} = \sum_{i}dZ_i^{t[2]} \\
\end{array}
Derivations:

Forward propagation from layer $0$ to $1$ and $1$ to $2$:
\begin{align}
Z^{[1]} &= W^{[1]}A^{[0]}+b^{[1]} \\
 A^{[1]} &= g(Z^{[1]}) \\
Z^{[2]} &= W^{[2]}A^{[1]}+b^{[2]} \\
 A^{[2]} &= \sigma(Z^{[2]})
\end{align}
or using Einstein notation,
\begin{align}
Z^{s[1]}_i &= W^{s[1]}_r A^{r[0]}_i + b^{s[1]} \\
 A^{s[1]}_i &= g(Z^{s[1]}_i) \\
Z^{t[2]}_i &= W^{t[2]}_s A^{s[1]}_i + b^{t[2]} \\
 A^{t[2]}_i &= \sigma(Z^{t[2]}_i)
\end{align}
The cost function is $J$; suppose we know,
\begin{align}
dZ_i^{t[2]} = \frac{\partial J}{\partial Z_t^{i[2]}}
\end{align}
Backward propagation:
Layer 2,
\begin{align}
dW_s^{t[2]} &= \frac{\partial J}{\partial W_t^{s[2]}} &=&  \frac{\partial J}{\partial Z_t^{i[2]}}  \frac{\partial Z_t^{i[2]}}{\partial W_t^{s[2]}} &=& dZ_i^{t[2]} A_s^{i[1]} \\
db^{t[2]} &= \frac{\partial J}{\partial b_t^{[2]}} &=&  \frac{\partial J}{\partial Z_t^{i[2]}} \frac{\partial Z_t^{i[2]}}{\partial b_t^{[2]}} = dZ_i^{t[2]} 1^i &=& \sum_{i}dZ_i^{t[2]}
\end{align}
The correspondences between each matrix and its elements are $dZ^{[2]}$: $dZ^{t[2]}_i$, $A^{[1]}$: $A^{s[1]}_i$, $dW^{[2]}$: $dW^{t[2]}_s$, $db^{[2]}$: $db^{t[2]}$, so in matrix form,
\begin{align}
dW^{[2]} &= dZ^{[2]} A^{[1]T}\\
    db^{[2]} &= \sum_i dZ^{[2]}_i
\end{align}

Layer 1,
\begin{align}
dA_i^{s[1]} &= \frac{\partial J}{\partial A_s^{i[1]}} =  \frac{\partial J}{\partial Z_t^{i[2]}}  \frac{\partial Z_t^{i[2]}}{\partial A_s^{i[1]}} = dZ_i^{t[2]} W_t^{s[2]} \\
& \frac{\partial A_s^{i[1]}}{\partial Z_s^{i[1]}} = g'(Z_i^{s[1]})
\end{align}
In matrix form,
\begin{align}
dA^{[1]} = W^{[2]T} dZ^{[2]}
\end{align}
and,
\begin{align}
dZ_i^{s[1]} &= \frac{\partial J}{\partial Z_s^{i[1]}} = \frac{\partial J}{\partial A_s^{i[1]}}  \frac{\partial A_s^{i[1]}}{\partial Z_s^{i[1]}} = dZ_i^{t[2]} W_t^{s[2]} g'(Z_i^{s[1]}) , \text{ element-wise multiplication for } g'\\
dW_r^{s[1]}&= \frac{\partial J}{\partial W_s^{r[1]}} = \frac{\partial J}{\partial Z_s^{i[1]}} \frac{\partial Z_s^{i[1]}}{\partial W_s^{r[1]}} = dZ_i^{s[1]} A_r^{i[0]} \\
db^{s[1]} &= \frac{\partial J}{\partial b_s^{[1]}} = \frac{\partial J}{\partial Z_s^{i[1]}} \frac{\partial Z_s^{i[1]}} {\partial b_s^{[1]}} = dZ_i^{s[1]} 1^i = \sum_i dZ_i^{s[1]}
\end{align}
and,
\begin{align}
dZ^{[1]} &= W^{[2]T} dZ^{[2]} g'(Z^{[1]}) , \text{ element-wise multiplication for } g'\\
dW^{[1]} &= dZ^{[1]} A^{[0]T} \\
db^{[1]} &= \sum_i dZ^{[1]}_i
\end{align}



No comments:

Post a Comment