Fully Connected Neural Networks From Scratch
Neural networks are often explained in the most complicated ways possible, but we’ll show just how simple they can be.
Suppose we wish to implement a fully-connected feedforward neural network with one input layer, one hidden layer, and one output layer. We call such a network a two-layer neural network: the input layer is not counted, since it applies no transformation.
In the feedforward step, we feed an input \(x \in \mathbb{R}^{d}\) (a column vector) through the network:
\[z^{(1)} = W^{(1)}x + b^{(1)}\] \[a^{(1)} = \sigma(z^{(1)})\] \[z^{(2)} = W^{(2)}a^{(1)} + b^{(2)}\] \[\hat{y} = a^{(2)} = \mbox{softmax}(z^{(2)})\]
In code we process a whole batch at once: the input X holds N examples as columns, so X has shape (D_in, N).

import numpy as np
import numpy.matlib

def python_feedforward(X, cache):
    assert X.shape[0] == cache['W1'].shape[1]
    cache['X'] = X
    # z1 = W1 X + b1, replicating b1 across the batch
    z1 = cache['W1'].dot(X) + np.matlib.repmat(cache['b1'], cache['N'], 1).T
    cache['z1'] = z1
    # element-wise sigmoid
    a1 = 1. / (1. + np.exp(-z1))
    cache['a1'] = a1
    assert a1.shape[0] == cache['W2'].shape[1]
    z2 = cache['W2'].dot(a1) + np.matlib.repmat(cache['b2'], cache['N'], 1).T
    cache['z2'] = z2
    # column-wise softmax over the D_out classes
    logits = np.exp(z2) / np.matlib.repmat(np.sum(np.exp(z2), axis=0), cache['D_out'], 1)
    cache['logits'] = logits  # the softmax outputs, \hat{y} = a^{(2)}
    return logits
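As a quick shape check, we can push a tiny random batch through the network and confirm that each column of the softmax output sums to one. (A minimal sketch; feedforward_smoke_test and its random weights are illustrative, not part of the original script.)

def feedforward_smoke_test():
    # assumed sizes: 3 inputs, 4 hidden units, 5 classes, batch of 2
    cache = {'N': 2, 'D_in': 3, 'H': 4, 'D_out': 5,
             'W1': np.random.randn(4, 3), 'b1': np.random.randn(4),
             'W2': np.random.randn(5, 4), 'b2': np.random.randn(5)}
    X = np.random.randn(3, 2)
    logits = python_feedforward(X, cache)
    assert logits.shape == (5, 2)
    assert np.allclose(logits.sum(axis=0), 1.0)  # each column is a distribution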
We’ll add L2 regularization on the weights of the network:
import scipy.linalg

def norms(cache):
    # squared Frobenius norms of the weight matrices (biases are not penalized)
    all_sum = scipy.linalg.norm(cache['W1']) ** 2
    all_sum += scipy.linalg.norm(cache['W2']) ** 2
    return all_sum
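Here \(\|W\|_F^2 = \sum_{i,j} W_{ij}^2\) is the squared Frobenius norm. The factor of \(0.5\) multiplying this penalty in the objective below is chosen so that its gradient with respect to \(W\) is simply \(\lambda W\).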
We use a cross-entropy loss:
\[CE(y, \hat{y}) = - \sum\limits_{i=0}^{C-1} y_i \log(\hat{y}_i)\]
And we also add the penalty term for the magnitude of the weights:
\[J(W,b;x,y) = \frac{1}{N} \sum\limits_{i=1}^N CE^{(i)}(y, \hat{y}) + 0.5\,\lambda \left( \|W^{(1)}\|_F^2 + \|W^{(2)}\|_F^2 \right)\]
def python_loss(cache, y):
"""
a = array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11]])
logits[np.array([3,2,1]),np.arange(3)]
returns array([9, 7, 5])
"""
yc_scores = cache['logits'][y,np.arange(cache['N'])]
ce_sum = -np.sum(np.log(yc_scores))
data_loss = ce_sum / cache['N'] # divide by batch size
reg_loss = 0.5 * cache['reg'] * norms(cache)
loss = data_loss + reg_loss
print "Loss: ", loss
return loss
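A useful sanity check: if the network predicts a uniform distribution \(\hat{y}_i = 1/C\) over \(C\) classes, the data loss is \(\log C\); with \(C = 5\) classes that is \(\log 5 \approx 1.61\), roughly what the loss should be near a small random initialization, before the regularization term is added.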
Now we use the chain rule to backpropagate gradients:
\[\frac{\partial CE(y, \hat{y})}{\partial z^{(2)}} = \hat{y} - y\]
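This compact form is worth deriving once. With \(\hat{y}_j = e^{z_j^{(2)}} / \sum_k e^{z_k^{(2)}}\) and a one-hot \(y\) whose correct class is \(c\), the loss is \(CE = -\log \hat{y}_c = -z_c^{(2)} + \log \sum_k e^{z_k^{(2)}}\), so
\[\frac{\partial CE}{\partial z_j^{(2)}} = -\delta_{jc} + \frac{e^{z_j^{(2)}}}{\sum_k e^{z_k^{(2)}}} = \hat{y}_j - y_j\]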
Recall that \(z^{(2)} = W^{(2)}a^{(1)} + b^{(2)}\). Therefore,
\[\frac{\partial CE(y, \hat{y})}{\partial W^{(2)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(2)}} \frac{\partial z^{(2)}}{\partial W^{(2)}}\] \[\frac{\partial CE(y, \hat{y})}{\partial W^{(2)}} = (\hat{y} - y)\, a^{(1)^T}\]
Similarly,
\[\frac{\partial CE(y, \hat{y})}{\partial b^{(2)}} = \hat{y} - y\]
Going across L2:
\[\frac{\partial z^{(2)}}{\partial a^{(1)}} = W^{(2)^T}\] \[\frac{\partial CE(y, \hat{y})}{\partial a^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(2)}} \frac{\partial z^{(2)}}{\partial a^{(1)}} = W^{(2)^T}(\hat{y} - y)\]
Going across the non-linearity of L1:
\[\frac{\partial CE(y, \hat{y})}{\partial z^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial a^{(1)}} \frac{\partial \sigma(z)}{\partial z^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial a^{(1)}} \circ \sigma(z^{(1)}) \circ (1 - \sigma(z^{(1)}))\]
Note that we have assumed that \(\sigma(\cdot)\) works on matrices by applying an element-wise sigmoid, and \(\circ\) is the element-wise (Hadamard) product. That brings us to our final gradients:
\[\frac{\partial CE(y, \hat{y})}{\partial W^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}} \frac{\partial z^{(1)}}{\partial W^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}}\, x^T\]
Similarly,
\[\frac{\partial CE(y, \hat{y})}{\partial b^{(1)}} = \frac{\partial CE(y, \hat{y})}{\partial z^{(1)}}\]
from numpy import matmul as mm

def python_backprop(cache, grads, y):
    # grads is filled in place
    # one-hot encode the labels: shape (D_out, N)
    one_hot_y = np.zeros((cache['D_out'], cache['N']))
    one_hot_y[y, np.arange(cache['N'])] = 1.
    # dCE/dz2 = y_hat - y, divided by the batch size
    diff = (1.0 / cache['N']) * (cache['logits'] - one_hot_y)
    grads['dW2'] = mm(diff, cache['a1'].T) + cache['reg'] * cache['W2']  # the 0.5 cancels
    grads['db2'] = np.sum(diff, axis=1)
    da1 = mm(cache['W2'].T, diff)
    # element-wise: sigma'(z1) = sigma(z1) * (1 - sigma(z1)) = a1 * (1 - a1)
    dz1 = np.multiply(da1, np.multiply(cache['a1'], 1 - cache['a1']))
    grads['dW1'] = mm(dz1, cache['X'].T) + cache['reg'] * cache['W1']  # the 0.5 cancels
    grads['db1'] = np.sum(dz1, axis=1)
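Before moving on to PyTorch, it is worth cross-checking these analytic gradients numerically, using the eval_numerical_gradient routine from the Numerical Gradient Evaluation section at the end of this post. (A minimal sketch; check_gradients is a hypothetical helper, not part of the original script.)

def check_gradients(X, y, cache):
    grads = {}
    python_feedforward(X, cache)
    python_loss(cache, y)
    python_backprop(cache, grads, y)
    def loss_given_W1(W1):
        # eval_numerical_gradient perturbs W1 in place, so just recompute
        cache['W1'] = W1
        python_feedforward(X, cache)
        return python_loss(cache, y)  # prints on every call; noisy but harmless
    num_dW1 = eval_numerical_gradient(loss_given_W1, cache['W1'], verbose=False)
    print('max |analytic - numerical| for dW1:', np.max(np.abs(grads['dW1'] - num_dW1)))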
Implementing the Same Network in PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class PytorchFCNet(nn.Module):
    def __init__(self, cache):
        super(PytorchFCNet, self).__init__()
        self.cache = cache
        self.fc1 = nn.Linear(cache['D_in'], cache['H'])
        self.fc2 = nn.Linear(cache['H'], cache['D_out'])
        self.weights_init()

    def weights_init(self):
        # deterministic initialization, chosen to match the NumPy version exactly
        for idx, m in enumerate(self.modules()):
            if isinstance(m, nn.Linear):
                if idx == 1:  # fc1: columns of W1 are 1, 2, 3; bias is 1
                    torch.nn.init.constant_(m.weight[:, 0], 1)
                    torch.nn.init.constant_(m.weight[:, 1], 2)
                    torch.nn.init.constant_(m.weight[:, 2], 3)
                    torch.nn.init.constant_(m.bias, 1)
                elif idx == 2:  # fc2: W2 and b2 are all 2s
                    torch.nn.init.constant_(m.weight, 2)
                    torch.nn.init.constant_(m.bias, 2)
    def forward(self, x):
        x = self.fc1(x)
        x = torch.sigmoid(x)
        x = self.fc2(x)
        return x
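Note that forward returns raw class scores. Below we apply F.log_softmax followed by F.nll_loss, which together compute the same cross-entropy as the NumPy version (F.cross_entropy would fuse the two steps into one call).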
def run_pytorch_fc_net(X, y, cache):
    model = PytorchFCNet(cache)
    # nn.Linear expects inputs as rows, so transpose the (D_in, N) array
    X = torch.from_numpy(X.T).float()
    y = torch.from_numpy(y).long()
    learning_rate = 5.0
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=cache['reg'])
    # (if using the optimizer's weight_decay, drop the manual norm term below)
    logits = model(X)
    logits = F.log_softmax(logits, dim=1)
    loss = F.nll_loss(logits, y)  # averages over the batch
    # add the L2 penalty by hand, excluding the bias weights
    weight_norms = torch.norm(model.fc1.weight) ** 2
    weight_norms += torch.norm(model.fc2.weight) ** 2
    loss = loss + 0.5 * cache['reg'] * weight_norms
    print('Loss:', loss.item())
    # Use autograd to compute the backward pass. This call computes the
    # gradient of the loss with respect to all Tensors with requires_grad=True;
    # afterwards each parameter's .grad holds its gradient.
    # optimizer.zero_grad()
    loss.backward()
    # optimizer.step()
    print('W1 grad:')
    print(model.fc1.weight.grad.data)
    print('b1 grad:')
    print(model.fc1.bias.grad.data)
    print('W2 grad:')
    print(model.fc2.weight.grad.data)
    print('b2 grad:')
    print(model.fc2.bias.grad.data)
    # To update weights manually with gradient descent, mutate the parameter
    # values in place under torch.no_grad(), so PyTorch does not build a
    # computational graph for the update step:
    # with torch.no_grad():
    #     model.fc1.weight -= learning_rate * model.fc1.weight.grad
    #     model.fc2.weight -= learning_rate * model.fc2.weight.grad
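For completeness, here is roughly what a full training loop would look like if we enabled the commented-out SGD optimizer. (A sketch, not part of the original script; note that weight_decay penalizes the biases too, whereas the manual norm term above excludes them.)

def train_pytorch(X, y, cache, num_epochs=100):
    model = PytorchFCNet(cache)
    X_t = torch.from_numpy(X.T).float()
    y_t = torch.from_numpy(y).long()
    # weight_decay implements the L2 penalty, so the manual norm term is dropped
    optimizer = optim.SGD(model.parameters(), lr=5.0, weight_decay=cache['reg'])
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        log_probs = F.log_softmax(model(X_t), dim=1)
        loss = F.nll_loss(log_probs, y_t)
        loss.backward()
        optimizer.step()
    return model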
def run_python_fc_net(X, y, cache):
    learning_rate = 5.0
    grads = {}
    python_feedforward(X, cache)
    print('logits:')
    print(cache['logits'])
    python_loss(cache, y)
    python_backprop(cache, grads, y)
    print('W1 grad')
    print(grads['dW1'])
    print('b1 grad')
    print(grads['db1'])
    print('W2 grad')
    print(grads['dW2'])
    print('b2 grad')
    print(grads['db2'])
    # # Update weights using gradient descent
    # cache['W1'] -= learning_rate * grads['dW1']
    # cache['b1'] -= learning_rate * grads['db1']
    # cache['W2'] -= learning_rate * grads['dW2']
    # cache['b2'] -= learning_rate * grads['db2']
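The commented-out update is plain gradient descent: for each parameter \(\theta \in \{W^{(1)}, b^{(1)}, W^{(2)}, b^{(2)}\}\),
\[\theta \leftarrow \theta - \eta \frac{\partial J}{\partial \theta}\]
where \(\eta\) is the learning rate.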
def main():
    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N = 2
    D_in = 3
    H = 4
    D_out = 5
    cache = {}
    cache['reg'] = 0.5
    cache['N'] = N
    cache['D_in'] = D_in
    cache['H'] = H
    cache['D_out'] = D_out
    # Create deterministic input data and class labels
    X = np.arange(D_in * N).reshape(D_in, N)
    y = np.array([3, 4])
    # Deterministically initialize weights so that the NumPy and PyTorch
    # versions start from identical parameters
    cache['W1'] = np.ones((H, D_in))
    cache['W1'][:, 1] *= 2.
    cache['W1'][:, 2] *= 3.
    cache['W2'] = np.ones((D_out, H)) * 2.
    cache['b1'] = np.ones(cache['H'])
    cache['b2'] = np.ones(cache['D_out']) * 2.
    run_pytorch_fc_net(X, y, cache)
    run_python_fc_net(X, y, cache)

if __name__ == '__main__':
    main()
Numerical Gradient Evaluation
From CS231n:
from typing import Callable

def eval_numerical_gradient(f: Callable, x: np.ndarray, verbose=True, h=0.00001):
    """A naive implementation of the numerical gradient of f at x.

    Args:
        f: a function that takes a single argument
        x: the point (numpy array) to evaluate the gradient at
        verbose: if True, print each partial derivative as it is computed
        h: step size for the centered difference
    """
    assert x.dtype in [np.float16, np.float32, np.float64]
    fx = f(x)  # evaluate function value at the original point
    grad = np.zeros_like(x)
    # iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h  # increment by h
        fxph = f(x)  # evaluate f(x + h)
        x[ix] = oldval - h
        fxmh = f(x)  # evaluate f(x - h)
        x[ix] = oldval  # restore
        # compute the partial derivative with the centered formula
        grad[ix] = (fxph - fxmh) / (2 * h)  # the slope
        if verbose:
            print(ix, grad[ix])
        it.iternext()  # step to the next dimension
    return grad
def sigmoid_numpy(x: np.ndarray) -> np.ndarray:
    # element-wise logistic sigmoid (this helper was implied but not defined above)
    return 1. / (1. + np.exp(-x))

def f_numpy(w: np.ndarray) -> np.ndarray:
    """A toy function of w = (w1, w2) for exercising the numerical gradient."""
    w1, w2 = w
    a = np.cos(w1)
    b = np.cos(w2)
    c = a * b
    d = sigmoid_numpy(w2)
    g = w1 + w2
    h = np.log(g)
    i = np.square(w1)
    j = i * w2
    f1 = c + d
    f2 = h + j
    # return f1
    return f2
def numerical_autodiff_toy_problem():
    """Evaluate the numerical gradient of f_numpy at (1, 2)."""
    x = np.array([1., 2.]).reshape(2, 1)
    grad = eval_numerical_gradient(f_numpy, x)
    print(grad)
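For reference, the analytic gradient of the returned branch \(f_2(w_1, w_2) = \log(w_1 + w_2) + w_1^2 w_2\) is
\[\frac{\partial f_2}{\partial w_1} = \frac{1}{w_1 + w_2} + 2 w_1 w_2, \qquad \frac{\partial f_2}{\partial w_2} = \frac{1}{w_1 + w_2} + w_1^2\]
so at \((w_1, w_2) = (1, 2)\) the printed gradient should be close to \((13/3,\ 4/3) \approx (4.333,\ 1.333)\).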
Matrix Calculus: https://ocw.mit.edu/courses/18-s096-matrix-calculus-for-machine-learning-and-beyond-january-iap-2023/download/
References:
(1) Eric Darve, Stanford University.
(2) Matrix calculus notes: https://atmos.washington.edu/~dennis/MatrixCalculus.pdf