Fully Connected Neural Networks From Scratch
Neural networks are often explained in the most complicated ways possible, but we’ll show just how simple they can be.
Suppose we wish to implement a fully-connected feedforward neural network with one input layer, one hidden layer, and one output layer. We call such a network a two-layer neural network: the input layer is not counted, since it applies no transformation.
In the feedforward step, we feed an input \(x \in \mathbb{R}^{d}\) (a column vector) through the network:
\[z^{(1)} = W^{(1)}x + b^{(1)}\] \[a^{(1)} = \sigma(z^{(1)})\] \[z^{(2)} = W^{(2)}a^{(1)} + b^{(2)}\] \[\hat{y} = a^{(2)} = \mbox{softmax}(z^{(2)})\]
In code we process a whole batch at once: the input X holds N examples as columns, so X has shape (D_in, N).

import numpy as np
import numpy.matlib

def python_feedforward(X, cache):
    assert X.shape[0] == cache['W1'].shape[1]
    cache['X'] = X
    # z1 = W1 X + b1, replicating b1 across the batch
    z1 = cache['W1'].dot(X) + np.matlib.repmat(cache['b1'], cache['N'], 1).T
    cache['z1'] = z1
    # element-wise sigmoid
    a1 = 1. / (1. + np.exp(-z1))
    cache['a1'] = a1
    assert a1.shape[0] == cache['W2'].shape[1]
    z2 = cache['W2'].dot(a1) + np.matlib.repmat(cache['b2'], cache['N'], 1).T
    cache['z2'] = z2
    # column-wise softmax over the D_out classes
    logits = np.exp(z2) / np.matlib.repmat(np.sum(np.exp(z2), axis=0), cache['D_out'], 1)
    cache['logits'] = logits  # the softmax outputs, \hat{y} = a^{(2)}
    return logits
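As a quick shape check, we can push a tiny random batch through the network and confirm that each column of the softmax output sums to one. (A minimal sketch; feedforward_smoke_test and its random weights are illustrative, not part of the original script.)

def feedforward_smoke_test():
    # assumed sizes: 3 inputs, 4 hidden units, 5 classes, batch of 2
    cache = {'N': 2, 'D_in': 3, 'H': 4, 'D_out': 5,
             'W1': np.random.randn(4, 3), 'b1': np.random.randn(4),
             'W2': np.random.randn(5, 4), 'b2': np.random.randn(5)}
    X = np.random.randn(3, 2)
    logits = python_feedforward(X, cache)
    assert logits.shape == (5, 2)
    assert np.allclose(logits.sum(axis=0), 1.0)  # each column is a distribution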
We’ll add L2 regularization on the weights of the network:
import scipy.linalg

def norms(cache):
    # squared Frobenius norms of the weight matrices (biases are not penalized)
    all_sum = scipy.linalg.norm(cache['W1']) ** 2
    all_sum += scipy.linalg.norm(cache['W2']) ** 2
    return all_sum
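Here \(\|W\|_F^2 = \sum_{i,j} W_{ij}^2\) is the squared Frobenius norm. The factor of \(0.5\) multiplying this penalty in the objective below is chosen so that its gradient with respect to \(W\) is simply \(\lambda W\).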
We use a cross-entropy loss:
\[CE(y, \hat{y}) = - \sum\limits_{i=0}^{C-1} y_i \log(\hat{y}_i)\]
And we also add the penalty term for the magnitude of the weights:
\[J(W,b;x,y) = \frac{1}{N} \sum\limits_{i=1}^N CE^{(i)}(y, \hat{y}) + 0.5\,\lambda \left( \|W^{(1)}\|_F^2 + \|W^{(2)}\|_F^2 \right)\]
def python_loss(cache, y):
"""
a = array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
logits[np.array([3,2,1]),np.arange(3)]
returns array([9, 7, 5])
"""
yc_scores = cache['logits'][y,np.arange(cache['N'])]
ce_sum = -np.sum(np.log(yc_scores))
data_loss = ce_sum / cache['N'] # divide by batch size
reg_loss = 0.5 * cache['reg'] * norms(cache)
loss = data_loss + reg_loss
print "Loss: ", loss
return loss
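A useful sanity check: if the network predicts a uniform distribution \(\hat{y}_i = 1/C\) over \(C\) classes, the data loss is \(\log C\); with \(C = 5\) classes that is \(\log 5 \approx 1.61\), roughly what the loss should be near a small random initialization, before the regularization term is added.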
Now we use the chain rule to backpropagate gradients:
\[\frac{\partial CE(y, \hat{y})}{\partial z^{(2)}} = \hat{y} - y\]
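This compact form is worth deriving once. With \(\hat{y}_j = e^{z_j^{(2)}} / \sum_k e^{z_k^{(2)}}\) and a one-hot \(y\) whose correct class is \(c\), the loss is \(CE = -\log \hat{y}_c = -z_c^{(2)} + \log \sum_k e^{z_k^{(2)}}\), so
\[\frac{\partial CE}{\partial z_j^{(2)}} = -\delta_{jc} + \frac{e^{z_j^{(2)}}}{\sum_k e^{z_k^{(2)}}} = \hat{y}_j - y_j\]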
Recall that \(z^{(2)} = W^{(2)}a^{(1)} + b^{(2)}\). Therefore,
\[\frac{\partial CE(y, \hat{y})}{\partial W^{(2)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(2)}} \frac{\partial z^{(2)}}{\partial W^{(2)}}\] \[\frac{\partial CE(y, \hat{y})}{\partial W^{(2)}} = (\hat{y} - y)\, a^{(1)^T}\]
Similarly,
\[\frac{\partial CE(y, \hat{y})}{\partial b^{(2)}} = \hat{y} - y\]
Going across L2:
\[\frac{\partial z^{(2)}}{\partial a^{(1)}} = W^{(2)^T}\] \[\frac{\partial CE(y, \hat{y})}{\partial a^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(2)}} \frac{\partial z^{(2)}}{\partial a^{(1)}} = W^{(2)^T}(\hat{y} - y)\]
Going across the non-linearity of L1:
\[\frac{\partial CE(y, \hat{y})}{\partial z^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial a^{(1)}} \frac{\partial \sigma(z)}{\partial z^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial a^{(1)}} \circ \sigma(z^{(1)}) \circ (1 - \sigma(z^{(1)}))\]
Note that we have assumed that \(\sigma(\cdot)\) works on matrices by applying an element-wise sigmoid, and \(\circ\) is the element-wise (Hadamard) product. That brings us to our final gradients:
\[\frac{\partial CE(y, \hat{y})}{\partial W^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}} \frac{\partial z^{(1)}}{\partial W^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}}\, x^T\]
Similarly,
\[\frac{\partial CE(y, \hat{y})}{\partial b^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}}\]
from numpy import matmul as mm

def python_backprop(cache, grads, y):
    # grads is filled in place
    # one-hot encode the labels: shape (D_out, N)
    one_hot_y = np.zeros((cache['D_out'], cache['N']))
    one_hot_y[y, np.arange(cache['N'])] = 1.
    # dCE/dz2 = y_hat - y, divided by the batch size
    diff = (1.0 / cache['N']) * (cache['logits'] - one_hot_y)
    grads['dW2'] = mm(diff, cache['a1'].T) + cache['reg'] * cache['W2']  # the 0.5 cancels
    grads['db2'] = np.sum(diff, axis=1)
    da1 = mm(cache['W2'].T, diff)
    # element-wise: sigma'(z1) = sigma(z1) * (1 - sigma(z1)) = a1 * (1 - a1)
    dz1 = np.multiply(da1, np.multiply(cache['a1'], 1 - cache['a1']))
    grads['dW1'] = mm(dz1, cache['X'].T) + cache['reg'] * cache['W1']  # the 0.5 cancels
    grads['db1'] = np.sum(dz1, axis=1)
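Before moving on to PyTorch, it is worth cross-checking these analytic gradients numerically, using the eval_numerical_gradient routine from the Numerical Gradient Evaluation section at the end of this post. (A minimal sketch; check_gradients is a hypothetical helper, not part of the original script.)

def check_gradients(X, y, cache):
    grads = {}
    python_feedforward(X, cache)
    python_loss(cache, y)
    python_backprop(cache, grads, y)
    def loss_given_W1(W1):
        # eval_numerical_gradient perturbs W1 in place, so just recompute
        cache['W1'] = W1
        python_feedforward(X, cache)
        return python_loss(cache, y)  # prints on every call; noisy but harmless
    num_dW1 = eval_numerical_gradient(loss_given_W1, cache['W1'], verbose=False)
    print('max |analytic - numerical| for dW1:', np.max(np.abs(grads['dW1'] - num_dW1)))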
Implementing the Same Network in PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class PytorchFCNet(nn.Module):
    def __init__(self, cache):
        super(PytorchFCNet, self).__init__()
        self.cache = cache
        self.fc1 = nn.Linear(cache['D_in'], cache['H'])
        self.fc2 = nn.Linear(cache['H'], cache['D_out'])
        self.weights_init()

    def weights_init(self):
        # deterministic initialization, chosen to match the NumPy version exactly
        for idx, m in enumerate(self.modules()):
            if isinstance(m, nn.Linear):
                if idx == 1:  # fc1: columns of W1 are 1, 2, 3; bias is 1
                    torch.nn.init.constant_(m.weight[:, 0], 1)
                    torch.nn.init.constant_(m.weight[:, 1], 2)
                    torch.nn.init.constant_(m.weight[:, 2], 3)
                    torch.nn.init.constant_(m.bias, 1)
                elif idx == 2:  # fc2: W2 and b2 are all 2s
                    torch.nn.init.constant_(m.weight, 2)
                    torch.nn.init.constant_(m.bias, 2)
    def forward(self, x):
        x = self.fc1(x)
        x = torch.sigmoid(x)
        x = self.fc2(x)
        return x
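Note that forward returns raw class scores. Below we apply F.log_softmax followed by F.nll_loss, which together compute the same cross-entropy as the NumPy version (F.cross_entropy would fuse the two steps into one call).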
def run_pytorch_fc_net(X, y, cache):
    model = PytorchFCNet(cache)
    # nn.Linear expects inputs as rows, so transpose the (D_in, N) array
    X = torch.from_numpy(X.T).float()
    y = torch.from_numpy(y).long()
    learning_rate = 5.0
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=cache['reg'])
    # (if using the optimizer's weight_decay, drop the manual norm term below)
    logits = model(X)
    logits = F.log_softmax(logits, dim=1)
    loss = F.nll_loss(logits, y)  # averages over the batch
    # add the L2 penalty by hand, excluding the bias weights
    weight_norms = torch.norm(model.fc1.weight) ** 2
    weight_norms += torch.norm(model.fc2.weight) ** 2
    loss = loss + 0.5 * cache['reg'] * weight_norms
    print('Loss:', loss.item())
    # Use autograd to compute the backward pass. This call computes the
    # gradient of the loss with respect to all Tensors with requires_grad=True;
    # afterwards each parameter's .grad holds its gradient.
    # optimizer.zero_grad()
    loss.backward()
    # optimizer.step()
    print('W1 grad:')
    print(model.fc1.weight.grad.data)
    print('b1 grad:')
    print(model.fc1.bias.grad.data)
    print('W2 grad:')
    print(model.fc2.weight.grad.data)
    print('b2 grad:')
    print(model.fc2.bias.grad.data)
    # To update weights manually with gradient descent, mutate the parameter
    # values in place under torch.no_grad(), so PyTorch does not build a
    # computational graph for the update step:
    # with torch.no_grad():
    #     model.fc1.weight -= learning_rate * model.fc1.weight.grad
    #     model.fc2.weight -= learning_rate * model.fc2.weight.grad
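For completeness, here is roughly what a full training loop would look like if we enabled the commented-out SGD optimizer. (A sketch, not part of the original script; note that weight_decay penalizes the biases too, whereas the manual norm term above excludes them.)

def train_pytorch(X, y, cache, num_epochs=100):
    model = PytorchFCNet(cache)
    X_t = torch.from_numpy(X.T).float()
    y_t = torch.from_numpy(y).long()
    # weight_decay implements the L2 penalty, so the manual norm term is dropped
    optimizer = optim.SGD(model.parameters(), lr=5.0, weight_decay=cache['reg'])
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        log_probs = F.log_softmax(model(X_t), dim=1)
        loss = F.nll_loss(log_probs, y_t)
        loss.backward()
        optimizer.step()
    return model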
def run_python_fc_net(X, y, cache):
    learning_rate = 5.0
    grads = {}
    python_feedforward(X, cache)
    print('logits:')
    print(cache['logits'])
    python_loss(cache, y)
    python_backprop(cache, grads, y)
    print('W1 grad')
    print(grads['dW1'])
    print('b1 grad')
    print(grads['db1'])
    print('W2 grad')
    print(grads['dW2'])
    print('b2 grad')
    print(grads['db2'])
    # # Update weights using gradient descent
    # cache['W1'] -= learning_rate * grads['dW1']
    # cache['b1'] -= learning_rate * grads['db1']
    # cache['W2'] -= learning_rate * grads['dW2']
    # cache['b2'] -= learning_rate * grads['db2']
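The commented-out update is plain gradient descent: for each parameter \(\theta \in \{W^{(1)}, b^{(1)}, W^{(2)}, b^{(2)}\}\),
\[\theta \leftarrow \theta - \eta \frac{\partial J}{\partial \theta}\]
where \(\eta\) is the learning rate.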
def main():
    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N = 2
    D_in = 3
    H = 4
    D_out = 5
    cache = {}
    cache['reg'] = 0.5
    cache['N'] = N
    cache['D_in'] = D_in
    cache['H'] = H
    cache['D_out'] = D_out
    # Create deterministic input data and class labels
    X = np.arange(D_in * N).reshape(D_in, N)
    y = np.array([3, 4])
    # Deterministically initialize weights so that the NumPy and PyTorch
    # versions start from identical parameters
    cache['W1'] = np.ones((H, D_in))
    cache['W1'][:, 1] *= 2.
    cache['W1'][:, 2] *= 3.
    cache['W2'] = np.ones((D_out, H)) * 2.
    cache['b1'] = np.ones(cache['H'])
    cache['b2'] = np.ones(cache['D_out']) * 2.
    run_pytorch_fc_net(X, y, cache)
    run_python_fc_net(X, y, cache)

if __name__ == '__main__':
    main()
Numerical Gradient Evaluation
From CS231n:
from typing import Callable

def eval_numerical_gradient(f: Callable, x: np.ndarray, verbose=True, h=0.00001):
    """A naive implementation of the numerical gradient of f at x.

    Args:
        f: a function that takes a single argument
        x: the point (numpy array) to evaluate the gradient at
        verbose: if True, print each partial derivative as it is computed
        h: step size for the centered difference
    """
    assert x.dtype in [np.float16, np.float32, np.float64]
    fx = f(x)  # evaluate function value at the original point
    grad = np.zeros_like(x)
    # iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # restore
        # compute the partial derivative with the centered formula
        grad[ix] = (fxph - fxmh) / (2 * h)  # the slope
        if verbose:
            print(ix, grad[ix])
        it.iternext()  # step to the next dimension
    return grad
def sigmoid_numpy(x: np.ndarray) -> np.ndarray:
    # element-wise logistic sigmoid (this helper was implied but not defined above)
    return 1. / (1. + np.exp(-x))

def f_numpy(w: np.ndarray) -> np.ndarray:
    """A toy function of w = (w1, w2) for exercising the numerical gradient."""
    w1, w2 = w
    a = np.cos(w1)
    b = np.cos(w2)
    c = a * b
    d = sigmoid_numpy(w2)
    g = w1 + w2
    h = np.log(g)
    i = np.square(w1)
    j = i * w2
    f1 = c + d
    f2 = h + j
    # return f1
    return f2
def numerical_autodiff_toy_problem():
    """Evaluate the numerical gradient of f_numpy at (1, 2)."""
    x = np.array([1., 2.]).reshape(2, 1)
    grad = eval_numerical_gradient(f_numpy, x)
    print(grad)
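For reference, the analytic gradient of the returned branch \(f_2(w_1, w_2) = \log(w_1 + w_2) + w_1^2 w_2\) is
\[\frac{\partial f_2}{\partial w_1} = \frac{1}{w_1 + w_2} + 2 w_1 w_2, \qquad \frac{\partial f_2}{\partial w_2} = \frac{1}{w_1 + w_2} + w_1^2\]
so at \((w_1, w_2) = (1, 2)\) the printed gradient should be close to \((13/3,\ 4/3) \approx (4.333,\ 1.333)\).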
Matrix Calculus: https://ocw.mit.edu/courses/18-s096-matrix-calculus-for-machine-learning-and-beyond-january-iap-2023/download/
References:
(1) Eric Darve, Stanford University.
(2) Matrix calculus notes: https://atmos.washington.edu/~dennis/MatrixCalculus.pdf