Table of Contents:

Two-Layer FC Net: Forward Pass

Neural networks are often explained in the most complicated ways possible, but we’ll show just how simple they can be.

Suppose we wish to implement a fully-connected feedforward neural network with 1 input layer, 1 hidden layer, and 1 output layer. We call such a network to be a two-layer neural network (ignoring the input layer as it is trivially present).

In the feedforward step, we feed an input \(x \in \mathbb{R}^{1 \times d}\) through the network.

\[z^{(1)} = W^{(1)}x + b^{(1)}\] \[a^{(1)} = \sigma(z^{(1)})\] \[z^{(2)} = W^{(2)}a^{(1)} + b^{(2)}\] \[\hat{y} = a^{(2)} = \mbox{ softmax }(z^{(2)} )\]

Forward Pass In Numpy

def python_feedforward(X: np.ndarray, cache: dict[str, np.ndarray]):
  assert(X.shape[0] == cache['W1'].shape[1])
  cache['X'] = X

  z1 = cache['W1'].dot(X) + np.matlib.repmat(cache['b1'], cache['N'], 1).T
  cache['z1'] = z1

  a1 = 1./ (1. + np.exp(-z1))
  cache['a1'] = a1

  assert(a1.shape[0] == cache['W2'].shape[1])
  z2 = cache['W2'].dot(a1) + np.matlib.repmat(cache['b2'], cache['N'], 1).T
  cache['z2'] = z2

  #logits = np.exp(z2) / np.matlib.repmat( np.sum(np.exp(z2.T),axis=1).T, cache['D_out'], 1)
  logits = np.exp(z2) / np.matlib.repmat( np.sum(np.exp(z2),axis=0), cache['D_out'], 1)
  cache['logits'] = logits # also called cache.yc, a2
  return logits

We’ll add L2 regularization on the weights of the network:

def norms(cache):
  all_sum =  scipy.linalg.norm(cache['W1']) **2
  all_sum += scipy.linalg.norm(cache['W2']) **2
  return all_sum

We use a cross-entropy loss:

\[CE(y, \hat{y}) = - \sum\limits_{i=0}^{C-1} y_i \mbox{ log }(\hat{y}_i)\]

And we also add the penalization term for the magnitude of \(W\):

\[J(W,b;x,y) = \frac{1}{N} \sum\limits_{i=1}^N CE^{(i)}(y, \hat{y}) + 0.5 \lambda \| p \|^2\]
def python_loss(cache: dict[str, np.ndarray], y: np.ndarray):
  """
  a = array([[ 0,  1,  2],
  	       [ 3,  4,  5],
  	       [ 6,  7,  8],
  	       [ 9, 10, 11]])
  logits[np.array([3,2,1]),np.arange(3)]

  returns array([9, 7, 5])
  """
  yc_scores = cache['logits'][y,np.arange(cache['N'])]
  ce_sum = -np.sum(np.log(yc_scores))

  data_loss = ce_sum / cache['N'] # divide by batch size
  reg_loss = 0.5 * cache['reg'] * norms(cache)
  loss = data_loss + reg_loss
  print "Loss: ", loss
  return loss

Backprop: Derivation and Numpy Implementation

Now we use the chain rule to backpropagate gradients:

\[\frac{\partial CE(y, \hat{y}) }{ \partial z^{(2)}} = \hat{y} − y\]

Recall that \(z^{(2)} = W^{(2)}a^{(1)} + b^{(2)} \). Therefore,

\[\frac{ \partial CE(y, \hat{y} )}{\partial W^{(2)}} = \frac{ \partial CE(y, \hat{y} )}{\partial z^{(2)}} \frac{\partial z^{(2)}}{\partial W^{(2)}}\] \[\frac{ \partial CE(y, \hat{y} )}{ \partial W^{(2)} }= ( \hat{y} − y) a^{(1)^T}\]

Similarly,

\[\frac{ \partial CE(y, yˆ) }{ \partial b^{(2)} }= \hat{y} − y\]

Going across L2:

\[\frac{ \partial z^{(2)} }{\partial a^{(1)} }= W^{(2)^T}\] \[\frac{ \partial CE(y, \hat{y}) }{\partial a^{(1)} } = \frac{\partial CE(y, \hat{y})}{\partial z^{(2)} } \frac{ \partial z^{(2)} }{\partial a^{(1)} } = W^{(2)^T} ( \hat{y} − y)\]

Going across the non-linearity of L1:

\[\frac{\partial CE(y, yˆ) }{\partial z^{(1)} }= \frac{ \partial CE(y, \hat{y})}{ \partial a^{(1)}} \frac{ \partial σ(z)}{ \partial z^{(1)} }\] \[= \frac{\partial CE(y, \hat{y})}{\partial a^{(1)} } \odot \sigma(z^{(1)}) ◦ (1 − \sigma(z^{(1)}))\]

Note that we have assumed that \(\sigma(·)\) works on matrices by applying an element-wise sigmoid, and ◦ is the element-wise (Hadamard) product. That brings us to our final gradients:

\[\frac{\partial CE(y, \hat{y})}{\partial W^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}}\frac{\partial z^{(1)}}{\partial W^{(1)}} = x^T\]

Similarly, \(\frac{\partial CE(y, \hat{y})}{\partial b(1)} = \partial CE(y, \hat{y}) ...\)

def python_backprop(
  cache: dict[str, np.ndarray], grads: dict[str, np.ndarray], y: np.ndarray
):
  # divide by batch size
  one_hot_y = np.zeros((cache['D_out'],cache['N']))
  one_hot_y[y,np.arange(cache['N'])] = np.ones(cache['N'])
  diff = (1.0 / cache['N']) * (cache['logits'] - one_hot_y)
  grads['dW2'] = mm(diff, cache['a1'].T) + cache['reg'] * cache['W2'] # 0.5 cancels
  #grads['db2'] = np.sum(diff, axis=1)
  grads['db2'] = np.sum(diff.T, axis=0).T

  da1 = mm( cache['W2'].T, diff )
  # elementwise
  dz1 = np.multiply(da1, np.multiply( cache['a1'], (1 - cache['a1']) ) )

  grads['dW1'] = mm(dz1, cache['X'].T ) + cache['reg'] * cache['W1']  # 0.5 cancels
  #grads['db1'] = np.sum(dz1, axis=1)
  grads['db1'] = np.sum(dz1.T, axis=0).T

Implementing the same network in Pytorch

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pdb
from torch.autograd import Variable
import numpy.matlib
import scipy.linalg
from numpy import matmul as mm


class PytorchFCNet(nn.Module):
  def __init__(self, cache):
    super(PytorchFCNet, self).__init__()
    self.cache = cache
    self.fc1 = nn.Linear( cache['D_in'], cache['H'])
    self.fc2 = nn.Linear( cache['H'], cache['D_out'])

    self.weights_init()

  def weights_init(self):
    for idx, m in enumerate( self.modules() ):
      #classname = m.__class__.__name__
      if not isinstance(m, nn.Linear):
        continue
      print 'idx = %d' % idx
      if idx == 1:
        #m.weight.data = torch.from_numpy(self.cache['W1'])
        #m.bias.data = torch.from_numpy(self.cache['b1'])
        torch.nn.init.constant(m.weight[:,0], 1)
        torch.nn.init.constant(m.weight[:,1], 2)
        torch.nn.init.constant(m.weight[:,2], 3)
        torch.nn.init.constant(m.bias, 1)
      elif idx == 2:
        #m.weight.data = torch.from_numpy(self.cache['W2'])
        #m.bias.data = torch.from_numpy(self.cache['b2'])
        torch.nn.init.constant(m.weight, 2)
        torch.nn.init.constant(m.bias, 2)

		# # Create random Tensors for weights; setting requires_grad=True means that we
		# # want to compute gradients for these Tensors during the backward pass.
		# W1 = torch.fromnumpy(cache['W1'] , requires_grad=True)
		# b1 = torch.fromnumpy(cache['b1'] device=device, requires_grad=True)

		# W2 = torch.fromnumpy(cache['W2'] device=device, requires_grad=True)
		# b2 = torch.fromnumpy(cache['b2'] device=device, requires_grad=True)

		# W1 = torch.as_tensor(W1, device=device)
def forward(self, x):
  pdb.set_trace()
  x = self.fc1(x)
  x = F.sigmoid(x)
  x = self.fc2(x)
  return x
def run_pytorch_fc_net(X, y, cache):

  model = PytorchFCNet(cache)
  #device = torch.device('cpu')

  # Create random Tensors to hold input and outputs
  X = torch.from_numpy(X.T) #, device=device)
  y = torch.from_numpy(y) #, device=device)

  X = Variable(X.type(torch.FloatTensor), volatile=False)
  y = Variable(y.type(torch.LongTensor), volatile=False)

  learning_rate = 5.0

  #optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=cache['reg'])



  logits = model(X)
  logits = F.log_softmax(logits) #, dim=1)
  loss = F.nll_loss(logits, y, size_average=True)

  # exclude bias weights
  norms =  torch.norm( list(model.modules())[1].weight) **2
  norms += torch.norm( list(model.modules())[2].weight) **2

  loss += 0.5 * cache['reg'] * norms


  print loss # sum up batch loss

  # Use autograd to compute the backward pass. This call will compute the
  # gradient of loss with respect to all Tensors with requires_grad=True.
  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  #optimizer.zero_grad()


  loss.backward()
  #optimizer.step()


  # optimizer weight_decay=0

  print 'W1 grad:'
  print list(model.modules())[1].weight.grad.data

  print 'b1 grad:'
  print list(model.modules())[1].bias.grad.data

  print 'W2 grad:'
  print list(model.modules())[2].weight.grad.data

  print 'b2 grad:'
  print list(model.modules())[2].bias.grad.data


  # print 'W1 weight:'
  # print list(model.modules())[1].weight.data

  # print 'b1 weight:'
  # print list(model.modules())[1].bias.data

  # print 'W2 weight:'
  # print list(model.modules())[2].weight.data

  # print 'b2 weight:'
  # print list(model.modules())[2].bias.data

  # Update weights using gradient descent. For this step we just want to mutate
  # the values of w1 and w2 in-place; we don't want to build up a computational
  # graph for the update steps, so we use the torch.no_grad() context manager
  # to prevent PyTorch from building a computational graph for the updates
  # with torch.no_grad():
  # list(model.modules())[1].weight.data -= learning_rate * list(model.modules())[1].weight.grad.data
  # list(model.modules())[2].weight.data -= learning_rate * list(model.modules())[2].weight.grad.data
def run_python_fc_net(X,y, cache):
  learning_rate = 5.0

  grads = {}

  python_feedforward(X, cache)
  print 'logits:'
  print cache['logits']
  python_loss(cache, y)
  python_backprop(cache, grads, y)

  print 'W1 grad'
  print grads['dW1']

  print 'b1 grad'
  print grads['db1']

  print 'W2 grad'
  print grads['dW2']

  print 'b2 grad'
  print grads['db2']

  # # Update weights using gradient descent
  # cache['W1'] -= learning_rate * grads['dW1']
  # cache['b1'] -= learning_rate * grads['db1']

  # cache['W2'] -= learning_rate * grads['dW2']
  # cache['b2'] -= learning_rate * grads['db2']


  # print 'weight for W1', cache['W1']
  # print 'weight for b1', cache['b1']
  # print 'weight for W2', cache['W2']
  # print 'weight for b2', cache['b2']



def main():
  # N is batch size; D_in is input dimension;
  # H is hidden dimension; D_out is output dimension.
  N = 2
  D_in = 3
  H = 4
  D_out = 5

  cache = {}
  cache['reg'] = 0.5

  cache['N'] = N
  cache['D_in'] = D_in
  cache['H'] = H
  cache['D_out'] = D_out

  # Create random input and output data.
  X = np.array(range(D_in*N))
  X = np.reshape(X, (D_in, N)) 
  y = np.array([3,4]) 
  #	[1,0],
  #	[0,1],
  #	[0,0]]	)


  #np.random.randn(D_out,N)

  # Randomly initialize weights.
  cache['W1'] = np.ones((H, D_in))
  cache['W1'][:,1] *= 2.
  cache['W1'][:,2] *= 3.
  cache['W2'] = np.ones((D_out, H)) * 2.

  cache['b1'] = np.ones(cache['H'])
  cache['b2'] = np.ones(cache['D_out']) * 2.
  run_pytorch_fc_net(X, y, cache)
  run_python_fc_net(X, y, cache)

Numerical Gradient Evaluation

The numerical gradient is calculated using the finite difference approximation formula (specifically a centered difference formula):

\[\frac{df(x)}{dx} = \lim_{h \to 0} \frac{f(x + h) - f(x - h)}{2h}\]

However, we use a vectorized variant: we calculate the partial derivative for every dimension to assemble the full gradient vector \(\nabla f(\mathbf{x})\):

\[\frac{\partial f(\mathbf{x})}{\partial x_i} \approx \frac{f(\mathbf{x} + h\mathbf{e}_i) - f(\mathbf{x} - h\mathbf{e}_i)}{2h}\]

Karpathy’s CS231N at Stanford uses:

def eval_numerical_gradient(f: Callable, x: np.ndarray, verbose=True, h=0.00001):
  """A naive implementation of numerical gradient of f at x

  Args:
    f: A function that takes a single argument.
    x: the point (numpy array) to evaluate the gradient at.
  """
  assert x.dtype in [np.float16, np.float32, np.float64]
  fx = f(x)  # evaluate function value at original point
  grad = np.zeros_like(x)
  # Iterate over all indices in x.
  it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
  while not it.finished:

    # Evaluate function at x+h
    ix = it.multi_index
    oldval = x[ix]
    x[ix] = oldval + h  # Increment by h
    fxph = f(x)  # Evaluate f(x + h)
    x[ix] = oldval - h
    fxmh = f(x)  # Evaluate f(x - h)
    x[ix] = oldval  # Restore

    # Compute the partial derivative with centered formula.
    grad[ix] = (fxph - fxmh) / (2 * h)  # the slope
    if verbose:
      print(ix, grad[ix])
    it.iternext()  # step to next dimension

  return grad
def f_numpy(w: np.ndarray) -> np.ndarray:
  """ """
  w1, w2 = w
  a = np.cos(w1)
  b = np.cos(w2)
  c = a * b
  d = sigmoid_numpy(w2)
  g = w1 + w2
  h = np.log(g)
  i = np.square(w1)
  j = i * w2

  f1 = c + d
  f2 = h + j
  #return f1
  return f2
def numerical_autodiff_toy_problem():
  """ """
  # x = np.array([1.,2.]).reshape(2,1)
  # grad = eval_numerical_gradient_array(f=f_numpy, x=x, df=1.)
  # print(grad)

  x = np.array([1.,2.]).reshape(2,1)
  grad = eval_numerical_gradient(f_numpy,x)
  print(grad)

  # print(f_numpy(np.array([1.,2.])))

Matrix Calculus: https://ocw.mit.edu/courses/18-s096-matrix-calculus-for-machine-learning-and-beyond-january-iap-2023/download/

References:

(1) Eric Darve, Stanford University.

(2) CS 231N.

https://atmos.washington.edu/~dennis/MatrixCalculus.pdf