import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import datasets
import time
2-Layer Neural Network (1 Hidden Layer): $(L=2)$
3-Layer Neural Network (2 Hidden Layers): $(L=3)$
Here $L$ counts the layers that carry parameters (the hidden layers plus the output layer); the input is not counted.
$\large A_{<l>}$ indicates that this is $\large A$ (the Activated Output) in the $\large l^{th}$ Layer.
In the Forward Pass, activations are computed layer by layer:
In the First Hidden Layer: $(l=1)$
In the Second Hidden Layer: $(l=2)$
In the Final (Output) Layer: $(l=L=3)$
In the Backward Pass (Backpropagation), gradients flow through the same layers in reverse order:
In the Final (Output) Layer: $(l=L=3)$
In the Second Hidden Layer: $(l=2)$
In the First Hidden Layer: $(l=1)$
$\large \frac{dCost}{dB_{<l>}} = \frac{1}{m}\sum_{i=1}^{m} dZ_{<l>}^{(i)}$, i.e. the per-example $dZ_{<l>}$ averaged over the $m$ training examples (matching `np.sum(dZ, -1, keepdims=True) / m` in the code below).
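Written out in this notation (a compact summary consistent with the implementation below; $g$ is ReLU for the hidden layers and softmax for the output, and $m$ is the number of training examples):

$$\large Z_{<l>} = W_{<l>} A_{<l-1>} + B_{<l>}, \qquad A_{<l>} = g\left(Z_{<l>}\right), \qquad A_{<0>} = X$$

$$\large \frac{dCost}{dW_{<l>}} = \frac{1}{m}\, dZ_{<l>} \, A_{<l-1>}^{T}$$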
We notice that Backpropagation of gradients in a Neural Network works very similarly to Linear/Logistic Regression, except that there are multiple layers and the Chain Rule is applied repeatedly, layer by layer.
Once the gradients have been passed back through Backpropagation, we can update all the Model Parameters at once with Gradient Descent.
Because gradients are repeatedly multiplied together in the backward pass (via the Chain Rule), Neural Networks can suffer from Vanishing/Exploding Gradients as the network gets very deep (see the formulas below).
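Loosely written (ignoring exact matrix shapes), the gradient reaching an early layer is a product of one factor per later layer, which is why very deep stacks can shrink or blow these gradients up; Gradient Descent then nudges every parameter against its gradient with step size $\alpha$ (the learning rate):

$$\large \frac{dCost}{dW_{<l>}} = \frac{dCost}{dA_{<L>}} \cdot \frac{dA_{<L>}}{dZ_{<L>}} \cdot \frac{dZ_{<L>}}{dA_{<L-1>}} \cdots \frac{dZ_{<l>}}{dW_{<l>}}$$

$$\large W_{<l>} \leftarrow W_{<l>} - \alpha \frac{dCost}{dW_{<l>}}, \qquad B_{<l>} \leftarrow B_{<l>} - \alpha \frac{dCost}{dB_{<l>}}$$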
iris = datasets.load_iris()
iris.keys()
n_F = len(iris['feature_names'])
n_C = len(iris['target_names'])
iris['data'].shape, iris['target'].shape
pd.DataFrame(iris['data'], columns=iris['feature_names']).head()
iris.target_names
X = iris['data'].T
Y_class = iris['target']
X.shape, Y_class.shape
def one_hot(array, num_classes):
    new_array = np.zeros((len(array), num_classes))
    for i, val in enumerate(array):
        new_array[i, val] = 1
    return new_array
Y = one_hot(Y_class, n_C).T
Y.shape
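As a quick sanity check of `one_hot` (a toy input, not part of the dataset): each label becomes a row with a single 1 in the column of its class.
# Toy example: labels [0, 2, 1] with 3 classes
print(one_hot(np.array([0, 2, 1]), 3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]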
indices = np.arange(iris['target'].shape[0])
np.random.shuffle(indices)
X = X[:,indices]
Y = Y[:,indices]
Y_class = Y_class[indices]
split_ratio = 0.2
split = int(Y.shape[1] * split_ratio)
X_train = X[:, split:]
X_val = X[:, :split]
Y_train = Y[:, split:]
Y_val = Y[:, :split]
Y_class_train = Y_class[split:]
Y_class_val = Y_class[:split]
X_train.shape, X_val.shape
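For reference, the same kind of shuffled split can be produced with scikit-learn's `train_test_split`; a minimal sketch (the `_alt` names are only for illustration, and the matrices are transposed because `train_test_split` expects samples in rows, whereas ours are features x samples; `stratify` additionally keeps the class proportions balanced, which the manual split above does not guarantee):
from sklearn.model_selection import train_test_split

# Split on the transposed (samples-in-rows) matrices, then transpose back
Xtr, Xva, Ytr, Yva = train_test_split(X.T, Y.T, test_size=0.2, stratify=Y_class)
X_train_alt, X_val_alt = Xtr.T, Xva.T
Y_train_alt, Y_val_alt = Ytr.T, Yva.T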
w1 = np.random.randn(16, n_F)
w2 = np.random.randn(32, 16)
w3 = np.random.randn(n_C, 32)
b1 = np.random.randn(16, 1)
b2 = np.random.randn(32, 1)
b3 = np.random.randn(n_C, 1)
# dtype=object keeps the differently-shaped weight/bias arrays in one container
params = np.array([[b1, w1],
                   [b2, w2],
                   [b3, w3]], dtype=object)
params.shape
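Unscaled standard-normal weights can produce very large pre-activations after a couple of layers. A common alternative for ReLU networks is He initialization, which scales each weight matrix by sqrt(2 / fan_in) and starts the biases at zero; an optional variation, not used in the runs below (the `_he` names are only for illustration):
# He initialization: scale by sqrt(2 / fan_in), zero biases
w1_he = np.random.randn(16, n_F) * np.sqrt(2 / n_F)
w2_he = np.random.randn(32, 16) * np.sqrt(2 / 16)
w3_he = np.random.randn(n_C, 32) * np.sqrt(2 / 32)
b1_he, b2_he, b3_he = np.zeros((16, 1)), np.zeros((32, 1)), np.zeros((n_C, 1))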
from scipy.special import softmax
def model(params, X):
    # Layer 1: linear transform + ReLU
    Z1 = params[0, 0] + np.dot(params[0, 1], X)
    A1 = np.maximum(Z1, 0)  # ReLU
    # Layer 2: linear transform + ReLU
    Z2 = params[1, 0] + np.dot(params[1, 1], A1)
    A2 = np.maximum(Z2, 0)  # ReLU
    # Output layer: linear transform + column-wise Softmax
    Z3 = params[2, 0] + np.dot(params[2, 1], A2)
    y_hat = softmax(Z3, 0)  # Softmax over the class axis
    # Cache intermediate values for Backpropagation
    cache = {
        'Z1': Z1,
        'A1': A1,
        'Z2': Z2,
        'A2': A2,
        'Z3': Z3
    }
    return y_hat, cache
y_hat, cache = model(params, X)
print(y_hat.shape)
print(cache.keys())
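Since `softmax` is applied along axis 0, each column of `y_hat` should be a probability distribution over the classes; a quick check:
# Every column (one per sample) should sum to 1
print(np.allclose(y_hat.sum(axis=0), 1))  # True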
def cost(prediction, Y, epsilon=1e-10):
    # Element-wise cross-entropy (binary form applied per class), averaged over the m samples (columns)
    error = np.sum((Y * np.log(prediction + epsilon)) + ((1 - Y) * np.log(1 - prediction + epsilon)), -1) / Y.shape[1]
    return -np.sum(error)
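The cost above follows the binary cross-entropy pattern applied element-wise. With a softmax output, the loss that pairs exactly with the gradient $dZ_{<3>} = \hat{y} - Y$ used in Backpropagation below is categorical cross-entropy; a minimal sketch of that variant (not the one used in the training runs below):
def categorical_cross_entropy(prediction, Y, epsilon=1e-10):
    # Mean over the m samples (columns) of -sum_c y_c * log(p_c)
    return -np.sum(Y * np.log(prediction + epsilon)) / Y.shape[1]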
def train(X, Y, params, epochs=1, learning_rate=3e-6, iterations=1):
    for epoch in range(epochs):
        start = time.time()
        for iteration in range(iterations):
            # Forward Pass
            pred, cache = model(params, X)
            # Calculate Loss
            loss = cost(pred, Y)
            # Calculate Gradients (Backpropagation)
            # Layer 3
            dZ3 = pred - Y  # c x m
            dw3 = np.dot(dZ3, cache['A2'].T) / dZ3.shape[1]  # c x h2
            db3 = np.sum(dZ3, -1, keepdims=True) / dZ3.shape[1]  # c x 1
            # Layer 2
            dA2 = np.dot(dZ3.T, params[2, 1]).T  # h2 x m
            dZ2 = dA2 * (cache['Z2'] > 0)  # h2 x m (ReLU derivative)
            dw2 = np.dot(dZ2, cache['A1'].T) / dZ2.shape[1]  # h2 x h1
            db2 = np.sum(dZ2, -1, keepdims=True) / dZ2.shape[1]  # h2 x 1
            # Layer 1
            dA1 = np.dot(dZ2.T, params[1, 1]).T  # h1 x m
            dZ1 = dA1 * (cache['Z1'] > 0)  # h1 x m (ReLU derivative)
            dw1 = np.dot(dZ1, X.T) / dZ1.shape[1]  # h1 x I
            db1 = np.sum(dZ1, -1, keepdims=True) / dZ1.shape[1]  # h1 x 1
            gradients = np.array([[db1, dw1], [db2, dw2], [db3, dw3]], dtype=object)
            # Update Parameters (Gradient Descent)
            params = params - (learning_rate * gradients)
        # Calculate Accuracy (once per epoch)
        class_pred = np.argmax(pred, 0)
        class_y = np.argmax(Y, 0)
        acc = (class_pred == class_y).sum() / Y.shape[1]
        print('Epoch {}:'.format(epoch + 1))
        print('Loss: {:.2f} | Accuracy: {:.2f}%\nTime Taken: {:.2f}s\n'.format(loss, acc * 100, time.time() - start))
    return params
def predict(X, Y, params):
    # Forward Pass
    pred, _ = model(params, X)
    # Calculate Accuracy
    class_pred = np.argmax(pred, 0)
    class_y = np.argmax(Y, 0)
    acc = np.sum(class_pred == class_y) / Y.shape[1]
    return acc, pred
params = train(X_train, Y_train, params, epochs=20, iterations=5000)
acc, _ = predict(X_val, Y_val, params)
print('Accuracy of Prediction on Validation Data: {:.2f}%'.format(acc*100))
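To see where the remaining errors are, a quick confusion matrix on the validation set (rows are true classes, columns are predicted classes); a small follow-up sketch, not part of the original training loop:
_, val_pred = predict(X_val, Y_val, params)
pred_classes = np.argmax(val_pred, 0)
conf_mat = np.zeros((n_C, n_C), dtype=int)
for true_c, pred_c in zip(Y_class_val, pred_classes):
    conf_mat[true_c, pred_c] += 1
print(conf_mat)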
wine = datasets.load_wine()
wine.keys()
df = pd.DataFrame(wine['data'], columns=wine['feature_names'])
df.head()
wine['target_names']
n_F = len(wine['feature_names'])
n_C = len(wine['target_names'])
X = wine['data'].T
Y_class = wine['target']
Y = one_hot(Y_class, n_C).T
X.shape, Y_class.shape, Y.shape
indices = np.arange(wine['target'].shape[0])
np.random.shuffle(indices)
X = X[:,indices]
Y = Y[:,indices]
Y_class = Y_class[indices]
split_ratio = 0.2
split = int(Y.shape[1] * split_ratio)
X_train = X[:, split:]
X_val = X[:, :split]
Y_train = Y[:, split:]
Y_val = Y[:, :split]
Y_class_train = Y_class[split:]
Y_class_val = Y_class[:split]
X_train.shape, X_val.shape
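The wine features sit on very different scales (e.g. proline runs into the hundreds-to-thousands while several other features stay below 10), which is one reason the learning rate below is dropped to 1e-6. An optional sketch of standardizing each feature using statistics from the training split only (not applied in the run below; the `_std` names are just for illustration):
# Standardize features: zero mean, unit variance per feature (row), using training stats only
mu = X_train.mean(axis=1, keepdims=True)
sigma = X_train.std(axis=1, keepdims=True) + 1e-8
X_train_std = (X_train - mu) / sigma
X_val_std = (X_val - mu) / sigma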
w1 = np.random.randn(16, n_F)
w2 = np.random.randn(32, 16)
w3 = np.random.randn(n_C, 32)
b1 = np.random.randn(16, 1)
b2 = np.random.randn(32, 1)
b3 = np.random.randn(n_C, 1)
# dtype=object again, since the weight/bias arrays have different shapes
params = np.array([[b1, w1],
                   [b2, w2],
                   [b3, w3]], dtype=object)
params.shape
params = train(X_train, Y_train, params, epochs=20, iterations=5000, learning_rate=1e-6)
acc, _ = predict(X_val, Y_val, params)
print('Accuracy of Prediction on Validation Data: {:.2f}%'.format(acc*100))