Machine Learning Programming Workshop

2.2 Logistic Regression in Machine Learning

Prepared By: Cheong Shiu Hong (FTFNCE)



In [1]:
import numpy as np # Linear Algebra
import pandas as pd # Data Frames
import matplotlib.pyplot as plt # Visualization
import matplotlib.cm as cm # Color Mapping
from mpl_toolkits.mplot3d import axes3d # 3D Visualization
import ipywidgets as widgets # Interactivity
from IPython.display import display # Display Widgets
In [2]:
%matplotlib notebook


1) What is Logistic Regression?

return to top

Introduction to Logistic Regression:

Pretty much the same as Linear Regression, but with a Sigmoid applied to the output to squash it to between 0 and 1, a.k.a. a Probabilistic Output

- Used for Classification Tasks

- Requires a Change in Cost Function (Cross Entropy instead of Mean Squared Error)

Denoted by: $ \sigma(x)$

$\sigma(x) = \frac{1}{1 + e^{-x}}$

$\sigma'(x) = \sigma(x) (1 - \sigma(x))$

In [3]:
def sigmoid(x, grad=False):
    if grad:
        return sigmoid(x) * (1-sigmoid(x))
    return 1/(1+np.exp(-x))
In [4]:
print('Sigmoid 6.5: {:.4f}'.format(sigmoid(6.5)))
print('Sigmoid 1.2: {:.4f}'.format(sigmoid(1.2)))
print('Sigmoid -8: {:.4f}'.format(sigmoid(-8)))
print('Sigmoid -4.5: {:.4f}'.format(sigmoid(-4.5)))
Sigmoid 6.5: 0.9985
Sigmoid 1.2: 0.7685
Sigmoid -8: 0.0003
Sigmoid -4.5: 0.0110
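
As a quick check that the grad=True branch matches the formula $\sigma'(x) = \sigma(x) (1 - \sigma(x))$, a central finite difference can be compared against it (a minimal sketch; this loop is illustrative and not part of the original cells):

h = 1e-5
for v in [-4.5, 0.0, 1.2, 6.5]:
    analytic = sigmoid(v, grad=True)
    numeric = (sigmoid(v + h) - sigmoid(v - h)) / (2 * h)  # central difference estimate
    print('x = {:>4}: analytic {:.6f} | numeric {:.6f}'.format(v, analytic, numeric))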


Create Fake Dataset

In [5]:
x = np.array([1., 1.9, 1.2, 4., 3.2, 2.4, 3.6, 4.6])
y = np.array([0, 0, 0, 1, 0, 1, 1, 1])
In [6]:
x = np.array([1., 1.9, 1.2, 4., 3.2, 2.4, 3.6, 4.6]) 
x = x - x.mean()
y = np.array([0, 0, 0, 1, 0, 1, 1, 1])
In [7]:
pd.DataFrame({'x':x,'y':y})
Out[7]:
x y
0 -1.7375 0
1 -0.8375 0
2 -1.5375 0
3 1.2625 1
4 0.4625 0
5 -0.3375 1
6 0.8625 1
7 1.8625 1
In [8]:
fig = plt.figure(figsize=(4,3))
plt.scatter(x,y)
plt.show()


2) Model with 1 Parameter (Gradient Only)

$$\huge pred = \sigma(b + w . x) $$

Taking the Y-Intercept (b) as Zero, Therefore:

$$\huge pred = \sigma(w . x) $$

Note: $\sigma(w . x)$ always crosses 0.5 at $x = 0$, which is why x was mean-centered in In [6].

return to top

In [9]:
def model(w, x):
    return sigmoid(w * x)
In [10]:
def plot_model(title, init_weight=None):
    line_x = np.arange(-3, 8)
    if not init_weight:
        init_weight = np.random.randn(1)

    fig = plt.figure(figsize=(7,4))
    ax = fig.add_subplot(1,1,1)

    plt.suptitle(title, fontsize=15)
    plt.xlabel('X', fontsize=15)
    plt.ylabel('Y', fontsize=15)

#     plt.axis([-4, 9, -1, 2])
    ax.axhline(0, color='black')
    ax.axvline(0, color='black')
    ax.axhline(0.5, color='black', linewidth=0.5)

    ax.scatter(x, y, c='red')
    line, = ax.plot(line_x, model(init_weight, line_x), c='green')

    def update(weight=init_weight):
        line.set_ydata(model(weight, line_x))
        fig.canvas.draw()

    gradient = widgets.FloatText(value=init_weight, description='Gradient', step=0.1)
    display(gradient)
    widgets.interactive(update, weight=gradient)
In [11]:
plot_model('Model with 1 Variable')


2.1 Simulation for Visualization

return to top

In [12]:
size = 50

w_sim = np.linspace(-10, 15, size)

w_matrix = w_sim.reshape(size, 1)
x_matrix = x.reshape(1, x.shape[0])
y_matrix = y.reshape(1, y.shape[0])

# Mean Squared Error as Cost Function
pred_sim = np.array([sigmoid(z) for z in w_matrix * x_matrix])
error_sim = (pred_sim - y_matrix)
cost_sim = np.sum(error_sim**2, 1)/len(y)

# Cross Entropy Loss as Cost Function
epsilon = 1e-8 # To Prevent -inf in log(pred)
cel_error = ((y_matrix * np.log(pred_sim + epsilon)) + ((1-y_matrix) * np.log(1-pred_sim + epsilon)))
cel_cost = -np.sum(cel_error, len(cel_error.shape)-1)/len(y)
In [13]:
print('W Sim:\n', w_sim, '\n\nMean Squared Error Sim:\n', cost_sim, '\n\nCross Entropy Loss Sim', cel_cost)
W Sim:
 [-10.          -9.48979592  -8.97959184  -8.46938776  -7.95918367
  -7.44897959  -6.93877551  -6.42857143  -5.91836735  -5.40816327
  -4.89795918  -4.3877551   -3.87755102  -3.36734694  -2.85714286
  -2.34693878  -1.83673469  -1.32653061  -0.81632653  -0.30612245
   0.20408163   0.71428571   1.2244898    1.73469388   2.24489796
   2.75510204   3.26530612   3.7755102    4.28571429   4.79591837
   5.30612245   5.81632653   6.32653061   6.83673469   7.34693878
   7.85714286   8.36734694   8.87755102   9.3877551    9.89795918
  10.40816327  10.91836735  11.42857143  11.93877551  12.44897959
  12.95918367  13.46938776  13.97959184  14.48979592  15.        ] 

Mean Squared Error Sim:
 [0.75004524 0.7500498  0.75004832 0.75003373 0.74999341 0.74990525
 0.74973112 0.74940593 0.74881952 0.74778616 0.74599372 0.74291782
 0.73767614 0.72877803 0.71369006 0.68809713 0.64481785 0.57324038
 0.46391338 0.32710187 0.20775584 0.14260362 0.12171093 0.12209105
 0.13087666 0.14268282 0.15519888 0.16735001 0.17862734 0.1888114
 0.19783977 0.20573706 0.2125751  0.21844919 0.22346363 0.22772278
 0.23132589 0.23436419 0.23691964 0.23906458 0.24086206 0.24236649
 0.24362443 0.24467554 0.24555336 0.24628621 0.24689791 0.24740844
 0.24783452 0.24819015] 

Cross Entropy Loss Sim [9.98656458 9.54669928 9.07016821 8.5726584  8.06538902 7.55423114
 7.04185782 6.52947237 6.01769061 5.50693773 4.99763504 4.49031608
 3.98574556 3.48509897 2.9902836  2.50454754 2.03364178 1.587892
 1.18501176 0.85053599 0.60795248 0.45996346 0.3858157  0.35912859
 0.36046306 0.37824192 0.4059994  0.44010443 0.47844466 0.51973409
 0.56315234 0.60815278 0.6543568  0.70149329 0.7493626  0.79781433
 0.84673298 0.89602854 0.94563001 0.99548081 1.04553554 1.09575753
 1.14611701 1.19658971 1.24715581 1.29779905 1.34850608 1.3992659
 1.45006945 1.50090924]


In [14]:
fig = plt.figure(figsize=(7,4))
ax = fig.add_subplot(111)

plt.title('Mean Squared Error Cost Curve', fontsize=20)
ax.set_xlabel('w', fontsize=15)
ax.set_ylabel('Cost', fontsize=15)

ax.plot(w_sim, cost_sim)
plt.show()


In [15]:
fig = plt.figure(figsize=(7,4))
ax = fig.add_subplot(111)

plt.title('Cross Entropy Loss Cost Curve', fontsize=20)
ax.set_xlabel('w', fontsize=15)
ax.set_ylabel('Cost', fontsize=15)

ax.plot(w_sim, cel_cost)
plt.show()
In [16]:
minimum_cost_index = np.argmin(cost_sim)
best_w = w_sim[minimum_cost_index]

print('Minimum Index:', minimum_cost_index, \
      '\nCost:', cost_sim[minimum_cost_index], \
      '\nGradient:', w_sim[minimum_cost_index])
Minimum Index: 22 
Cost: 0.12171092629904746 
Gradient: 1.2244897959183678
In [17]:
plot_model('Best Fit based on Simulation', init_weight=best_w)


3) Model with 2 Parameters (Gradient and Y-Intercept)

$$\huge pred = \sigma(b + w . x) $$

Expressed as:

$$\huge pred = \sigma(\theta_0 + \theta_1 . x)$$

return to top

In [18]:
def model(b, w, x):
    return sigmoid(b + w * x)
In [19]:
def plot_model(title, init_weight=None, init_bias=None):
    line_x = np.arange(-3, 8)
    if not init_weight:
        init_weight = np.random.randn(1)
    if not init_bias:
        init_bias = np.random.randn(1)

    fig = plt.figure(figsize=(7,4))
    ax = fig.add_subplot(1,1,1)

    plt.suptitle(title, fontsize=15)
    plt.xlabel('X', fontsize=15)
    plt.ylabel('Y', fontsize=15)

#     plt.axis([-2, 7, -1, 2])
    ax.axhline(0, color='black')
    ax.axvline(0, color='black')
    ax.axhline(0.5, color='black', linewidth=0.5)

    ax.scatter(x, y, c='red')
    line, = ax.plot(line_x, model(init_bias, init_weight, line_x), c='green')

    def update(weight=init_weight, bias=init_bias):
        line.set_ydata(model(bias, weight, line_x))
        fig.canvas.draw()

    gradient = widgets.FloatText(value=init_weight, description='Gradient')
    bias = widgets.FloatText(value=init_bias, description='Y-Intercept')
    display(gradient)
    display(bias)
    widgets.interactive(update, weight=gradient, bias=bias)
In [20]:
plot_model('Model with 2 Variables')


3.1 Simulation for Visualization

Create Simulation Data for Each Possible Point of $\theta_0$ and $\theta_1$

return to top

In [21]:
size = 100

w_sim = np.linspace(-10, 20, size)
b_sim = np.linspace(-50, 50, size)
W, B = np.meshgrid(w_sim, b_sim)

w_matrix = w_sim.reshape(size, 1)
b_matrix = b_sim.reshape(1, size)
x_matrix = x.reshape(1, x.shape[0])
y_matrix = y.reshape(1, y.shape[0])

plot_w = np.repeat(w_sim, size)
plot_b = np.tile(b_sim, size)

# (Sw,1) x (1xN) --> (Sw,N) --reshape--> (Sw,N,1)
wx = np.dot(w_matrix, x_matrix).reshape(size, x.shape[0], 1)

# (Sw,N,1) + (1,Sb) --> (Sw,N,Sb) --transpose--> (Sw,Sb,N)
pred_sim = sigmoid((wx + b_matrix).transpose(0, 2, 1))
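
To sanity-check the broadcasting above (a small sketch; the shapes follow from size = 100 and the 8 data points):

print('wx shape:      ', wx.shape)        # (100, 8, 1)
print('pred_sim shape:', pred_sim.shape)  # (100, 100, 8) --> (gradient, intercept, sample)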


Using Mean Squared Error (MSE) as our Cost Function

In [22]:
# ((Sw,Sb,N) - (1,N))^2 --> (Sw,Sb,N)
error_sim = pred_sim - y_matrix
squared_error_sim = error_sim**2

# Sum (Sw,Sb,N) over the sample axis --> (Sw,Sb)
cost_sim_mse = np.sum(squared_error_sim, len(squared_error_sim.shape)-1)/len(y) # Sum (100, 100, 8) in Dim 2 --> (100, 100)
plot_cost = cost_sim_mse.reshape(-1) # Flatten (100, 100) --> (10000,)
In [23]:
pd.DataFrame({'Gradient': plot_w, 'Intercept': plot_b, 'Cost': plot_cost}).head(5)
Out[23]:
Gradient Intercept Cost
0 -10.0 -50.000000 0.5
1 -10.0 -48.989899 0.5
2 -10.0 -47.979798 0.5
3 -10.0 -46.969697 0.5
4 -10.0 -45.959596 0.5
In [24]:
# 3D Visualization
out = widgets.Output(layout={'border': '1px solid black'})
out.append_stdout('Using Mean Squared Error gives us a Non-Convex Cost Curve')
display(out)

fig = plt.figure(figsize=(9,5))
ax = fig.add_subplot(111, projection='3d')

plot = ax.plot_surface(W, B, cost_sim_mse, cmap=cm.nipy_spectral)

cbar=plt.colorbar(plot)
cbar.set_label('\nCost', fontsize=20)

plt.title('Cost Curve\n', fontsize=20)
ax.set_xlabel('Gradient', fontsize=15)
ax.set_ylabel('Y-Intercept', fontsize=15)
ax.set_zlabel(' Cost', fontsize=15)

plt.show()
In [25]:
minimum_cost_index = np.argmin(plot_cost)
MSE_C = plot_cost[minimum_cost_index]
MSE_b = plot_b[minimum_cost_index]
MSE_w = plot_w[minimum_cost_index]

print('Minimum Index:', minimum_cost_index, \
      '\nCost:', MSE_C, \
      '\nY-Intercept:', MSE_b, \
      '\nGradient:', MSE_w)
Minimum Index: 9961 
Cost: 0.12501159512648735 
Y-Intercept: 11.61616161616162 
Gradient: 20.0


Using Cross Entropy Loss as our Cost Function

$$Cost = - \frac{1}{n}\sum\limits_{i=1}^{n} \left[ y_{i} . \log{(pred_{i})} + (1-y_{i}) . \log{(1-pred_{i})} \right]$$

if y is 0:

$y . \log{(pred)} = 0$

$\therefore Cost = - \frac{1}{n}\sum\limits_{i=1}^{n} (1-y) . \log{(1-pred)}$

if y is 1:

$(1-y) . \log{(1-pred)} = 0$

$\therefore Cost = - \frac{1}{n}\sum\limits_{i=1}^{n} y . \log{(pred)}$
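
A small numeric illustration of the two cases (a sketch; the cross_entropy helper below is only for this example and is not used elsewhere in the notebook):

def cross_entropy(y_true, pred, eps=1e-8):
    # loss for a single example; only the term matching the label contributes
    return -(y_true * np.log(pred + eps) + (1 - y_true) * np.log(1 - pred + eps))

print('y=1, pred=0.9:', cross_entropy(1, 0.9))  # confident and correct -> small loss
print('y=1, pred=0.1:', cross_entropy(1, 0.1))  # confident and wrong   -> large loss
print('y=0, pred=0.1:', cross_entropy(0, 0.1))  # confident and correct -> small loss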

In [26]:
epsilon = 1e-8 # To Prevent -inf in log(pred)

# y*log(pred) + (1-y)*log(1-pred): (1,N) broadcast with (Sw,Sb,N) --> (Sw,Sb,N)
error = ((y_matrix * np.log(pred_sim + epsilon)) + ((1-y_matrix) * np.log(1-pred_sim + epsilon)))

# Sum (Sw,Sb,N) over the sample axis --> (Sw,Sb)
cost_sim_ce = -np.sum(error, len(error.shape)-1)/len(y) # Sum (100, 100, 8) in Dim 2 --> (100, 100)
plot_cost = cost_sim_ce.reshape(-1) # Flatten (100, 100) --> (10000,)

Epsilon

In [27]:
# We add Epsilon because log(0) = -inf 
# Which will give us error in further calculations
np.log(0)
C:\Users\cheon\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log
  This is separate from the ipykernel package so we can avoid doing imports until
Out[27]:
-inf
In [28]:
pd.DataFrame({'Gradient': plot_w, 'Intercept': plot_b, 'Cost': plot_cost}).head(5)
Out[28]:
Gradient Intercept Cost
0 -10.0 -50.000000 9.21034
1 -10.0 -48.989899 9.21034
2 -10.0 -47.979798 9.21034
3 -10.0 -46.969697 9.21034
4 -10.0 -45.959596 9.21034
In [29]:
pd.DataFrame({'Gradient': plot_w, 'Intercept': plot_b, 'Cost': plot_cost}).tail(5)
Out[29]:
Gradient Intercept Cost
9995 20.0 45.959596 7.902536
9996 20.0 46.969697 8.146718
9997 20.0 47.979798 8.378751
9998 20.0 48.989899 8.587394
9999 20.0 50.000000 8.763603
In [30]:
# 3D Visualization
out = widgets.Output(layout={'border': '1px solid black'})
out.append_stdout('Using Cross Entropy Loss gives us a Convex Cost Curve')
display(out)

fig = plt.figure(figsize=(9,5))
ax = fig.add_subplot(111, projection='3d')

plot = ax.plot_surface(W, B, cost_sim_ce, cmap=cm.nipy_spectral)

cbar=plt.colorbar(plot)
cbar.set_label('\nCost', fontsize=20)

plt.title('Cost Curve\n', fontsize=20)
ax.set_xlabel('Gradient', fontsize=15)
ax.set_ylabel('Y-Intercept', fontsize=15)
ax.set_zlabel(' Cost', fontsize=15)


plt.show()
In [31]:
minimum_cost_index = np.argmin(plot_cost)
CE_C = plot_cost[minimum_cost_index]
CE_b = plot_b[minimum_cost_index]
CE_w = plot_w[minimum_cost_index]

print('Minimum Index:', minimum_cost_index, \
      '\nCost:', plot_cost[minimum_cost_index], \
      '\nY-Intercept:', plot_b[minimum_cost_index], \
      '\nGradient:', plot_w[minimum_cost_index])
Minimum Index: 4049 
Cost: 0.36859797123478977 
Y-Intercept: -0.5050505050505052 
Gradient: 2.121212121212121
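
Since $\sigma(z) = 0.5$ exactly when $z = 0$, the decision boundary implied by these parameters sits where $\theta_0 + \theta_1 . x = 0$ (a quick sketch using the values found above):

boundary_x = -CE_b / CE_w  # sigmoid crosses 0.5 where the linear part is zero
print('Predicted class switches at x = {:.3f}'.format(boundary_x))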
In [32]:
plot_model('Best Variables from Mean Squared Error', init_weight=MSE_w, init_bias=MSE_b)
In [33]:
plot_model('Best Variables from Cross Entropy Loss', init_weight=CE_w, init_bias=CE_b)


3.2 Derivatives of Cost Function

return to top

$$ z = \theta_0 + \theta_1.x $$

$$ pred = \sigma(z) $$

$$ Cost = - \frac{1}{n}\sum\limits_{i=1}^{n} \left[ y . \log{(pred)} + (1-y) . \log{(1-pred)} \right]$$

$$ \frac{dCost}{d\theta_0} = \frac{dCost}{dPred} \times \frac{dPred}{dZ} \times \frac{dZ}{d\theta_0} $$

$$ \frac{dCost}{d\theta_1} = \frac{dCost}{dPred} \times \frac{dPred}{dZ} \times \frac{dZ}{d\theta_1} $$

x = Input Data (Independent Variable)

$\theta$ = Weights to be Optimized

y = True Values of Each Input Data (Dependent Variable)

$ z = \theta_0 + \theta_1.x $

$ pred = \sigma(z) $

$ Cost = - \frac{1}{n}\sum\limits_{i=1}^{n} \left[ y . \log{(pred)} + (1-y) . \log{(1-pred)} \right]$

Derivatives:

Find $\frac{dCost}{d\theta_0}$ and $\frac{dCost}{d\theta_1}$

$\frac{dCost}{dPred} = \frac{pred - y}{pred(1 - pred)}$

$\frac{dPred}{dZ} = \sigma(z) \times (1 - \sigma(z))$

or

$\frac{dPred}{dZ} = pred \times (1 - pred)$

Simplify:

$\frac{dCost}{dZ} = \frac{pred - y}{pred(1 - pred)} \times pred(1 - pred)$

$\frac{dCost}{dZ} = pred - y$

$\frac{dZ}{d\theta_0} = 1$

$\frac{dZ}{d\theta_1} = x$

$\frac{dCost}{d\theta_0} = \frac{dCost}{dZ} \times \frac{dZ}{d\theta_0}$

$\frac{dCost}{d\theta_0} = (pred - y) \times (1) $

$\frac{dCost}{d\theta_0} = \sigma(\theta_0 + (\theta_1.x)) - y$

$\frac{dCost}{d\theta_1} = (pred - y) \times (x)$

$\frac{dCost}{d\theta_1} = x(\sigma(\theta_0 + (\theta_1.x)) - y)$
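
The simplification $\frac{dCost}{dZ} = pred - y$ can also be checked numerically with a finite difference (a sketch, not part of the original cells; ce_cost, t0, t1 and h are only used here):

def ce_cost(theta0, theta1, x, y, eps=1e-8):
    # mean cross entropy for the 2-parameter logistic model
    p = sigmoid(theta0 + theta1 * x)
    return -np.mean(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))

t0, t1, h = 0.3, -0.7, 1e-5
p = sigmoid(t0 + t1 * x)
analytic = [np.mean(p - y), np.mean((p - y) * x)]           # dCost/dtheta_0, dCost/dtheta_1
numeric  = [(ce_cost(t0 + h, t1, x, y) - ce_cost(t0 - h, t1, x, y)) / (2 * h),
            (ce_cost(t0, t1 + h, x, y) - ce_cost(t0, t1 - h, x, y)) / (2 * h)]
print('Analytic:', analytic)
print('Numeric: ', numeric)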


3.3 Demonstration of Gradient Descent

return to top

Mean Squared Error

In [34]:
def one_iteration(x, y, learning_rate=1e-1, first=False):
    global theta, prev_theta
    prev_theta = theta
    
    X = np.vstack([np.ones(y.shape[0]), x])

    # Model
    pred = sigmoid(np.dot(theta, X))
    
    # Calculations for Backpropagation

    error = pred - y
    cost = sum(error**2)/len(error)
    
    dcost_dtheta = np.array([sum(2 * Xi * error)/len(error) for Xi in X])
    theta = theta - (dcost_dtheta * learning_rate)
    
    if len(x.shape) == 1:
        data = "Cost: {}\nBias: {}\nWeight: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight: {}\n".format(cost, theta[0], theta[1], dcost_dtheta[0], dcost_dtheta[1])
    elif len(x.shape) == 2:
        data = "Cost: {}\nBias: {}\nWeight 1: {}\nWeight 2: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight 1: {}\nGradient of Cost Curve to Weight 2: {}\n"\
            .format(cost, theta[0], theta[1], theta[2], dcost_dtheta[0], dcost_dtheta[1], dcost_dtheta[2])
    
    return cost, dcost_dtheta, data
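
For reference, the descent that the widgets below drive interactively boils down to a plain loop (a sketch; theta_plain and X_b are illustrative names, and the update matches the MSE one_iteration above):

theta_plain = np.random.uniform(-8, 8, size=2)         # random start: (bias, weight)
X_b = np.vstack([np.ones(y.shape[0]), x])              # (2, N): bias row + x

for i in range(200):
    pred = sigmoid(theta_plain @ X_b)
    error = pred - y
    grad = (2 / len(y)) * X_b @ error                  # gradient of the MSE cost w.r.t. theta
    theta_plain = theta_plain - 0.1 * grad             # learning_rate = 0.1

print('Final theta (bias, weight):', theta_plain)
print('Final MSE:', np.mean((sigmoid(theta_plain @ X_b) - y) ** 2))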
In [35]:
def find_nearest(array, value):
    idx = (np.abs(array - value)).argmin()
    return idx

def update(iterations, nested=0):
    # Run Iteration
    if nested:
        for i in range(nested):
            cost, dcost_dtheta, data = one_iteration(x, y)
    else:
        cost, dcost_dtheta, data = one_iteration(x, y)
    # Replot Cost Curves
    b_nearest = find_nearest(b_sim, prev_theta[0])
    w_nearest = find_nearest(w_sim, prev_theta[1])
    curve_b.set_ydata(cost_sim_mse[w_nearest])
    curve_w.set_ydata(cost_sim_mse[:, b_nearest])
    # Replot Tangents
    intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
    intercept_w = cost - (prev_theta[1] * dcost_dtheta[1])
    tangent_b.set_ydata((dcost_dtheta[0]*b_sim)+intercept_b)
    tangent_w.set_ydata((dcost_dtheta[1]*w_sim)+intercept_w)
    # Replot Regression Line
    line.set_ydata(model(theta[0], theta[1], line_x))
    # Clear and Redraw
    out.clear_output()
    fig.canvas.draw()
    out.append_stdout(data)

def update_ten(iterations):
    if visualize.value:
        for i in range(10):
            update(1)
    else:
        update(1, nested=10)

def update_hundred(iterations):
    if visualize.value:
        for i in range(100):
            update(1)
    else:
        update(1, nested=100)

##### Widgets #####
out = widgets.Output(layout={'border': '1px solid black'})
button_one = widgets.Button(description="1 Iteration")
button_one.on_click(update)
button_ten = widgets.Button(description="10 Iterations")
button_ten.on_click(update_ten)
button_hundred = widgets.Button(description="!!!100 Iterations!!!")
button_hundred.on_click(update_hundred)
visualize = widgets.Checkbox(value=False, description='Visualize Changes', disabled=False)
hbox = widgets.HBox([button_one, button_ten, button_hundred, visualize])

# Initialize Random Weight & Bias
theta = np.random.uniform(-8,8, size=2)
# theta = np.array([-5, 5])

# Run First Iteration
cost, dcost_dtheta, data = one_iteration(x, y)

b_nearest = find_nearest(b_sim, theta[0])
w_nearest = find_nearest(w_sim, theta[1])

############################
### Visualize Cost Curve ###
############################
fig = plt.figure(figsize=(9,6))
ax = fig.add_subplot(111)
plt.suptitle('Cost Curves (MSE)')

##### Bias Subplot #####
plt.subplot(2,2,1)
# plt.axis([-10.5,10.5,-500,2000])
plt.title('Bias')
plt.xlabel('Bias')
plt.ylabel('Cost')
# Cost Curve
curve_b, = plt.plot(b_sim, cost_sim_mse[w_nearest], label='Cost Curve')
x1, x2, y1, y2 = plt.axis()
plt.axis([x1,x2,-0.3,y2])
# Tangent
intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
tangent_b, = plt.plot(b_sim, (dcost_dtheta[0]*b_sim)+intercept_b, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Weight Subplot #####
plt.subplot(2,2,2)
plt.title('Weight')
plt.xlabel('Weight')
# Cost Curve
curve_w, = plt.plot(w_sim, cost_sim_mse[:, b_nearest], label='Cost Curve')
x1, x2, y1, y2 = plt.axis()
plt.axis([x1,x2,-0.3,y2])
# Tangent
intercept_w = cost - (prev_theta[1] * dcost_dtheta[1])
tangent_w, = plt.plot(w_sim, (dcost_dtheta[1]*w_sim)+intercept_w, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Regression Line Subplot #####
plt.subplot(2,2,(3,4))
# plt.axis([-1,8,-3,10])
plt.title('Regression')
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.axhline(0, color='black')
plt.axvline(0, color='black')
plt.axhline(0.5, color='black', linewidth=0.5)
line_x = np.arange(-3, 8)
##### Data Points #####
plt.scatter(x, y, label='True Values', c='red')
##### Regression #####
line, = plt.plot(line_x, model(theta[0], theta[1], line_x), label='Logistic Regression Line', c='green')
plt.legend();

update(1)
display(hbox)   
out


Cross Entropy Loss

In [36]:
def one_iteration(x, y, learning_rate=1e-1, first=False):
    global theta, prev_theta
    prev_theta = theta
    
    X = np.vstack([np.ones(y.shape[0]), x])

    # Model
    pred = sigmoid(np.dot(theta, X))
    
    # Calculations for Backpropagation

    error = ((y * np.log(pred + epsilon)) + ((1-y) * np.log(1-pred + epsilon)))
    cost = - np.sum(error, len(error.shape)-1)/len(error)
    
    dcost_dtheta = np.array([sum(Xi * (pred-y))/len(error) for Xi in X])
    theta = theta - (dcost_dtheta * learning_rate)
    
    if len(x.shape) == 1:
        data = "Cost: {}\nBias: {}\nWeight: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight: {}\n".format(cost, theta[0], theta[1], dcost_dtheta[0], dcost_dtheta[1])
    elif len(x.shape) == 2:
        data = "Cost: {}\nBias: {}\nWeight 1: {}\nWeight 2: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight 1: {}\nGradient of Cost Curve to Weight 2: {}\n"\
            .format(cost, theta[0], theta[1], theta[2], dcost_dtheta[0], dcost_dtheta[1], dcost_dtheta[2])
    
    return cost, dcost_dtheta, data
In [37]:
def find_nearest(array, value):
    idx = (np.abs(array - value)).argmin()
    return idx

def update(iterations, nested=0):
    # Run Iteration
    if nested:
        for i in range(nested):
            cost, dcost_dtheta, data = one_iteration(x, y)
    else:
        cost, dcost_dtheta, data = one_iteration(x, y)
    # Replot Cost Curves
    b_nearest = find_nearest(b_sim, prev_theta[0])
    w_nearest = find_nearest(w_sim, prev_theta[1])
    curve_b.set_ydata(cost_sim_ce[w_nearest])
    curve_w.set_ydata(cost_sim_ce[:, b_nearest])
    # Replot Tangents
    intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
    intercept_w = cost - (prev_theta[1] * dcost_dtheta[1])
    tangent_b.set_ydata((dcost_dtheta[0]*b_sim)+intercept_b)
    tangent_w.set_ydata((dcost_dtheta[1]*w_sim)+intercept_w)
    # Replot Regression Line
    line.set_ydata(model(theta[0], theta[1], line_x))
    # Clear and Redraw
    out.clear_output()
    fig.canvas.draw()
    out.append_stdout(data)

def update_ten(iterations):
    if visualize.value:
        for i in range(10):
            update(1)
    else:
        update(1, nested=10)

def update_hundred(iterations):
    if visualize.value:
        for i in range(100):
            update(1)
    else:
        update(1, nested=100)

##### Widgets #####
out = widgets.Output(layout={'border': '1px solid black'})
button_one = widgets.Button(description="1 Iteration")
button_one.on_click(update)
button_ten = widgets.Button(description="10 Iterations")
button_ten.on_click(update_ten)
button_hundred = widgets.Button(description="!!!100 Iterations!!!")
button_hundred.on_click(update_hundred)
visualize = widgets.Checkbox(value=False, description='Visualize Changes', disabled=False)
hbox = widgets.HBox([button_one, button_ten, button_hundred, visualize])

# Initialize Random Weight & Bias
theta = np.random.uniform(-8,8, size=2)
# theta = np.array([-5, 5])

# Run First Iteration
cost, dcost_dtheta, data = one_iteration(x, y)

b_nearest = find_nearest(b_sim, theta[0])
w_nearest = find_nearest(w_sim, theta[1])

############################
### Visualize Cost Curve ###
############################
fig = plt.figure(figsize=(9,6))
ax = fig.add_subplot(111)
plt.suptitle('Cost Curves (Cross Entropy)')

##### Bias Subplot #####
plt.subplot(2,2,1)
# plt.axis([-10.5,10.5,-500,2000])
plt.title('Bias')
plt.xlabel('Bias')
plt.ylabel('Cost')
# Cost Curve
curve_b, = plt.plot(b_sim, cost_sim_ce[w_nearest], label='Cost Curve')
x1, x2, y1, y2 = plt.axis()
plt.axis([x1,x2,-1,y2])
# Tangent
intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
tangent_b, = plt.plot(b_sim, (dcost_dtheta[0]*b_sim)+intercept_b, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Weight Subplot #####
plt.subplot(2,2,2)
plt.title('Weight')
plt.xlabel('Weight')
# Cost Curve
curve_w, = plt.plot(w_sim, cost_sim_ce[:, b_nearest], label='Cost Curve')
x1, x2, y1, y2 = plt.axis()
plt.axis([x1,x2,-1,y2])
# Tangent
intercept_w = cost - (prev_theta[1] * dcost_dtheta[1])
tangent_w, = plt.plot(w_sim, (dcost_dtheta[1]*w_sim)+intercept_w, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Regression Line Subplot #####
plt.subplot(2,2,(3,4))
# plt.axis([-1,8,-3,10])
plt.title('Regression')
plt.xlabel('X', fontsize=15)
plt.ylabel('Y', fontsize=15)
plt.axhline(0, color='black')
plt.axvline(0, color='black')
plt.axhline(0.5, color='black', linewidth=0.5)
line_x = np.arange(-3, 8)
##### Data Points #####
plt.scatter(x, y, label='True Values', c='red')
##### Regression #####
line, = plt.plot(line_x, model(theta[0], theta[1], line_x), label='Logistic Regression Line', c='green')
plt.legend();

update(1)
display(hbox)   
out


4) Model with 3 Parameters (Multiple Logistic Regression)

$$\huge pred = \sigma(b + w_1.x_1 + w_2.x_2) $$

Expressed as:

$$\huge pred = \sigma(\theta_0 + \theta_1 . x_1 + \theta_2 . x_2) $$

return to top

In [38]:
def model(b, w1, x1, w2, x2):
    return sigmoid(b + (w1 * x1) + (w2 * x2))

New Fake Data for Multiple Logistic Regression

In [39]:
x1 = np.array([1., 2.8, 1.2, 2.7, 3.2, 2.4, 3.6, 4.6])
x1 = (x1 - x1.mean()) * 3
x2 = np.array([8.1, 6.7, 6.5, 3.2, 5.1, 4.9, 3.9, 4.1])
x2 = (x2 - x2.mean()) * 3
y = np.array([0, 0, 0, 1, 1, 0, 1, 1 ])

X = np.vstack([x1, x2])
In [40]:
fig = plt.figure(figsize=(4,3))
plt.title('x1 vs Y'); plt.xlabel('x1'); plt.ylabel('Y')
plt.scatter(x1, y, c='r')
plt.show()
In [41]:
fig = plt.figure(figsize=(4,3))
plt.title('x2 vs Y'); plt.xlabel('x2'); plt.ylabel('Y')
plt.scatter(x2, y, c='g')
plt.show()


In [42]:
size = 100

line_x1 = np.linspace(-5,5,size)
line_x2 = np.linspace(-5,5,size)
lx1, lx2 = np.meshgrid(line_x1, line_x2)

init = np.random.randn(3)

fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(x1, x2, y, s=100, c='r')
z = model(init[0], init[1], lx1, init[2], lx2)
line = ax.plot_surface(lx1, lx2, z, color='green')

plt.suptitle('Model with 3 Variables', fontsize=15)

plt.axes([0, 10, 0, 10])

def update(intercept=init[0], weight_1=init[1], weight_2=init[2]):
    global ax, line
    ax.clear()
    ax.set_xlabel('X1', fontsize=15)
    ax.set_ylabel('X2', fontsize=15)
    ax.set_zlabel('Y', fontsize=15)
    ax.scatter(x1, x2, y, s=100, c='r')
    z = model(intercept, weight_1, lx1, weight_2, lx2)
    line = ax.plot_surface(lx1, lx2, z, color='green')
    fig.canvas.draw()
    
intercept = widgets.FloatText(value=init[0], description='Y-Intercept', step=0.1)
gradient_1 = widgets.FloatText(value=init[1], description='Gradient 1', step=0.1)
gradient_2 = widgets.FloatText(value=init[2], description='Gradient 2', step=0.1)

widgets.interactive(update, intercept=intercept, weight_1=gradient_1, weight_2=gradient_2)


4.1 Simulation for Visualization

Create Simulation Data for Each Possible Point of $\theta_0$, $\theta_1$, and $\theta_2$

return to top

In [43]:
size = 100

w1_raw = np.linspace(-15,15,size)
w2_raw = np.linspace(-15,15,size)

w1_sim = np.repeat(w1_raw, size)
w2_sim = np.tile(w2_raw, size)
b_sim = np.linspace(-20, 20, size)

w1_matrix = w1_sim.reshape(size*size, 1)
w2_matrix = w2_sim.reshape(size*size, 1)
b_matrix = b_sim.reshape(1, size)
x1_matrix = x1.reshape(1, x1.shape[0])
x2_matrix = x2.reshape(1, x2.shape[0])
y_matrix = y.reshape(1, y.shape[0])

plot_w1 = np.repeat(w1_raw, size*size)
plot_w2 = np.tile(np.repeat(w2_raw, size), size)
plot_b = np.tile(b_sim, size*size)

# (Sw2xSw1,1) x (1xN) --> (Sw2xSw1,N) --reshape--> (Sw1xSw2,N,1)
w_matrix = np.hstack([w1_matrix, w2_matrix])
x_matrix = np.vstack([x1_matrix, x2_matrix])
wx = np.dot(w_matrix, x_matrix).reshape(size*size, x1.shape[0], 1)

# (Sw2xSw1,N,1) + (1,Sb) --> (Sw2xSw1,N,Sb) --transpose--> (Sw2xSw1,Sb,N)
pred_sim = sigmoid((wx + b_matrix).transpose(0, 2, 1))
In [44]:
# ((Sw1xSw2,Sb,N) - (1,N))^2 --> (Sw1xSw2,Sb,N)
error_sim = pred_sim - y_matrix
squared_error_sim = error_sim**2 
print(squared_error_sim.shape)
# Sum (Sw1xSw2,Sb,N) in Dim 2 --> (Sw1xSw2,Sb)
cost_sim_mse = np.sum(squared_error_sim, len(squared_error_sim.shape)-1)/len(y)

# Flatten (Sw1xSw2,Sb) --> (Sw1xSw2 x Sb)
plot_cost = cost_sim_mse.reshape(-1) # Flatten (10000, 100) --> (1000000,)
(10000, 100, 8)
In [45]:
pd.DataFrame({'Gradient_1': plot_w1, 'Gradient_2': plot_w2, 'Intercept': plot_b, 'Cost': plot_cost}).head(5)
Out[45]:
Gradient_1 Gradient_2 Intercept Cost
0 -15.0 -15.0 -20.000000 0.375717
1 -15.0 -15.0 -19.595960 0.375336
2 -15.0 -15.0 -19.191919 0.375156
3 -15.0 -15.0 -18.787879 0.375073
4 -15.0 -15.0 -18.383838 0.375039
In [46]:
# 3D Visualization
fig = plt.figure(figsize=(9,6))
ax = fig.add_subplot(111, projection='3d')

plot = ax.scatter(plot_w1, plot_w2, plot_b, c=plot_cost, cmap='nipy_spectral')

cbar=plt.colorbar(plot)
cbar.set_label('\nCost', fontsize=20)

plt.title('Cost Curve (MSE)\n', fontsize=30)
ax.set_xlabel('Gradient 1 (Theta 1)', fontsize=15)
ax.set_ylabel('Gradient 2 (Theta 2)', fontsize=15)
ax.set_zlabel('Y-Intercept (Theta 0)', fontsize=15)

plt.show()
In [47]:
minimum_cost_index = np.argmin(plot_cost)
print('Minimum Index:', minimum_cost_index, \
      '\nCost:', plot_cost[minimum_cost_index], \
      '\nY-Intercept:', plot_b[minimum_cost_index], \
      '\nGradient 1:', plot_w1[minimum_cost_index], \
      '\nGradient 2:', plot_w2[minimum_cost_index])
Minimum Index: 993628 
Cost: 7.792144492678969e-16 
Y-Intercept: -8.686868686868687 
Gradient 1: 15.0 
Gradient 2: -4.09090909090909

Cross Entropy Loss

In [48]:
epsilon = 1e-8 # To Prevent -inf in log(pred)

# y*log(pred) + (1-y)*log(1-pred): (1,N) broadcast with (Sw1xSw2,Sb,N) --> (Sw1xSw2,Sb,N)
error = ((y_matrix * np.log(pred_sim + epsilon)) + ((1-y_matrix) * np.log(1-pred_sim + epsilon)))

# Sum (Sw1xSw2,Sb,N) over the sample axis --> (Sw1xSw2,Sb)
cost_sim_ce = -np.sum(error, len(error.shape)-1)/len(y) # Sum (10000, 100, 8) in Dim 2 --> (10000, 100)
plot_cost = cost_sim_ce.reshape(-1) # Flatten (10000, 100) --> (1000000,)
In [49]:
# 3D Visualization
fig = plt.figure(figsize=(9,6))
ax = fig.add_subplot(111, projection='3d')

plot = ax.scatter(plot_w1, plot_w2, plot_b, c=plot_cost, cmap='nipy_spectral')

cbar=plt.colorbar(plot)
cbar.set_label('\nCost', fontsize=20)

plt.title('Cost Curve (Cross Entropy)\n', fontsize=30)
ax.set_xlabel('Gradient 1 (Theta 1)', fontsize=15)
ax.set_ylabel('Gradient 2 (Theta 2)', fontsize=15)
ax.set_zlabel('Y-Intercept (Theta 0)', fontsize=15)

plt.show()
In [50]:
minimum_cost_index = np.argmin(plot_cost)
print('Minimum Index:', minimum_cost_index, \
      '\nCost:', plot_cost[minimum_cost_index], \
      '\nY-Intercept:', plot_b[minimum_cost_index], \
      '\nGradient 1:', plot_w1[minimum_cost_index], \
      '\nGradient 2:', plot_w2[minimum_cost_index])
Minimum Index: 993527 
Cost: 4.820846148008532e-09 
Y-Intercept: -9.090909090909092 
Gradient 1: 15.0 
Gradient 2: -4.3939393939393945


4.2 Derivatives of Cost Function

return to top

$$ Pred = \theta_0 + \theta_1.x_1 + \theta_2.x_2 $$

$$ Error = Pred - y $$

$$ Cost = Error^2 $$

(Recap from the linear regression derivation; strictly, with $pred = \sigma(z)$ an extra $pred(1-pred)$ factor appears from the sigmoid's derivative, but the MSE gradient-descent cells below keep this simpler form.)

$$ \frac{dCost}{d\theta_0} = \frac{dCost}{dError} \times \frac{dError}{dPred} \times \frac{dPred}{d\theta_0} $$

$$ \frac{dCost}{d\theta_1} = \frac{dCost}{dError} \times \frac{dError}{dPred} \times \frac{dPred}{d\theta_1} $$

$$ \frac{dCost}{d\theta_2} = \frac{dCost}{dError} \times \frac{dError}{dPred} \times \frac{dPred}{d\theta_2} $$

x = Input Data (Independent Variable)

w = Weights to be Optimized

y = True Values of Each Input Data (Dependent Variable)

$pred = \theta_0 + (\theta_1 . x_1) + (\theta_2 . x_2)$

$error = pred - y$

$cost = error^2$

Derivatives:

Find $\frac{dCost}{d\theta_0}$, $\frac{dCost}{d\theta_1}$, and $\frac{dCost}{d\theta_2}$

Recap: $\frac{dCost}{d\theta_0} = 2 \times (\theta_0 + (\theta_1.x_1) + (\theta_2.x_2) - y)$

Recap: $\frac{dCost}{d\theta_1} = 2x_1 \times (\theta_0 + (\theta_1.x_1) + (\theta_2.x_2) - y)$

$\frac{dCost}{dError} = 2 \times error$

$\frac{dError}{dPred} = 1$

$\frac{dPred}{d\theta_2} = x_2$

$\frac{dCost}{d\theta_2} = \frac{dCost}{dError} \times \frac{dError}{dPred} \times \frac{dPred}{d\theta_2}$

$\frac{dCost}{d\theta_2} = (2 \times error) \times (1) \times (x_2)$

$\frac{dCost}{d\theta_2} = 2x_2 \times error$

$\frac{dCost}{d\theta_2} = 2x_2 \times (pred - y)$

$\frac{dCost}{d\theta_2} = 2x_2 \times (\theta_0 + (\theta_1.x_1) + (\theta_2.x_2) - y)$

Therefore:

$\frac{dCost}{d\theta_0} = 2 \times (\theta_0 + (\theta_1.x_1) + (\theta_2.x_2) - y)$

$\frac{dCost}{d\theta_1} = 2x_1 \times (\theta_0 + (\theta_1.x_1) + (\theta_2.x_2) - y)$

$\frac{dCost}{d\theta_2} = 2x_2 \times (\theta_0 + (\theta_1.x_1) + (\theta_2.x_2) - y)$
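
The three per-parameter rules above collapse into one vectorised update (a sketch under the same simplification; X_design and theta_demo are illustrative names, while the gradient-descent cells below build their own X and theta):

X_design = np.vstack([np.ones(y.shape[0]), x1, x2])   # (3, N): bias row, x1, x2
theta_demo = np.zeros(3)

pred = sigmoid(theta_demo @ X_design)                 # (N,) predictions
grad = (2 / len(y)) * X_design @ (pred - y)           # same update as dcost_dtheta in the MSE one_iteration
theta_demo = theta_demo - 0.1 * grad                  # one gradient-descent step

print('Gradient:', grad)
print('Updated theta:', theta_demo)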


4.3 Demonstration of Gradient Descent

return to top

Mean Squared Error

In [51]:
def one_iteration(x, y, learning_rate=1e-1, first=False):
    global theta, prev_theta
    prev_theta = theta
    
    X = np.vstack([np.ones(y.shape[0]), x])

    # Model
    pred = sigmoid(np.dot(theta, X))
    
    # Calculations for Backpropagation
    error = pred - y
    cost = sum(error**2)/len(error)
    
    dcost_dtheta = np.array([sum(2 * Xi * error)/len(error) for Xi in X])
    theta = theta - (dcost_dtheta * learning_rate)
    
    if len(x.shape) == 1:
        data = "Cost: {}\nBias: {}\nWeight: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight: {}\n".format(cost, theta[0], theta[1], dcost_dtheta[0], dcost_dtheta[1])
    elif len(x.shape) == 2:
        data = "Cost: {}\nBias: {}\nWeight 1: {}\nWeight 2: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight 1: {}\nGradient of Cost Curve to Weight 2: {}\n"\
            .format(cost, theta[0], theta[1], theta[2], dcost_dtheta[0], dcost_dtheta[1], dcost_dtheta[2])
    
    return cost, dcost_dtheta, data
In [52]:
def find_nearest(array, value):
    idx = (np.abs(array - value)).argmin()
    return idx

def match(array, value):
    cond = array == value
    return np.where(cond)[0]

def match_2(array1, array2, value1, value2):
    cond1 = array1 == value1
    cond2 = array2 == value2
    a = np.where(cond1)[0]
    b = np.where(cond2)[0]
    return a[np.where(np.in1d(a, b))][0]
    
def update(iterations, nested=0):
    # Run Iteration
    if nested:
        for i in range(nested):
            cost, dcost_dtheta, data = one_iteration(X, y)
    else:
        cost, dcost_dtheta, data = one_iteration(X, y)
    # Replot Cost Curves
    b_nearest = find_nearest(b_sim, prev_theta[0])
    w1_nearest = find_nearest(w1_raw, prev_theta[1])
    w2_nearest = find_nearest(w2_raw, prev_theta[2])
    curve_b.set_ydata(cost_sim_mse[match_2(w1_sim, w2_sim, w1_raw[w1_nearest], w2_raw[w2_nearest])])
    curve_w1.set_ydata(cost_sim_mse[match(w2_sim, w2_raw[w2_nearest]), b_nearest])
    curve_w2.set_ydata(cost_sim_mse[match(w1_sim, w1_raw[w1_nearest]), b_nearest])
    # Replot Tangents
    intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
    intercept_w1 = cost - (prev_theta[1] * dcost_dtheta[1])
    intercept_w2 = cost - (prev_theta[2] * dcost_dtheta[2])
    tangent_b.set_ydata((dcost_dtheta[0]*b_sim)+intercept_b)
    tangent_w1.set_ydata((dcost_dtheta[1]*w1_raw)+intercept_w1)
    tangent_w2.set_ydata((dcost_dtheta[2]*w2_raw)+intercept_w2)
    # Clear and Redraw
    out.clear_output()
    out.append_stdout(data)
    fig.canvas.draw()

def update_ten(iterations):
    if visualize.value:
        for i in range(10):
            update(1)
    else:
        update(1, nested=10)

def update_hundred(iterations):
    if visualize.value:
        for i in range(100):
            update(1)
    else:
        update(1, nested=100)
        
def update_thousand(iterations):
    if visualize.value:
        print('Not Allowed, Unselect Visualize to Proceed.')
    else:
        update(1, nested=1000)
        
##### Widgets #####
out = widgets.Output(layout={'border': '1px solid black'})
button_one = widgets.Button(description="1 Iteration")
button_one.on_click(update)
button_ten = widgets.Button(description="10 Iterations")
button_ten.on_click(update_ten)
button_hundred = widgets.Button(description="!!!100 Iterations!!!")
button_hundred.on_click(update_hundred)
button_thousand = widgets.Button(description="!!!Warning - 1000!!!")
button_thousand.on_click(update_thousand)
visualize = widgets.Checkbox(value=False, description='Visualize Changes', disabled=False)
hbox = widgets.HBox([button_one, button_ten, button_hundred, button_thousand, visualize])

# Initialize Random Weights & Bias
theta = np.random.uniform(-5,5,size=3)

# Run First Iteration
cost, dcost_dtheta, data = one_iteration(X, y)
                           
b_nearest = find_nearest(b_sim, prev_theta[0])
w1_nearest = find_nearest(w1_raw, prev_theta[1])
w2_nearest = find_nearest(w2_raw, prev_theta[2])

############################
### Visualize Cost Curve ###
############################
fig = plt.figure(figsize=(10.5,5))
ax = fig.add_subplot(111)
plt.suptitle('Cost Curves (MSE)')

##### Bias Subplot #####
plt.subplot(1,3,1)
plt.title('Bias')
plt.xlabel('Bias')
plt.ylabel('Cost')
# Cost Curve
curve_b, = plt.plot(b_sim, cost_sim_mse[match_2(w1_sim, w2_sim, w1_raw[w1_nearest], w2_raw[w2_nearest])], label='Cost Curve')
d1,d2,d3,d4 = plt.axis()
# plt.axis([d1,d2,-1,10])
# Tangent
intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
tangent_b, = plt.plot(b_sim, (dcost_dtheta[0]*b_sim)+intercept_b, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Weight 1 Subplot #####
plt.subplot(1,3,2)
# plt.axis([-30.5,30.5,-1000,1000])
plt.title('Weight 1')
plt.xlabel('Weight 1')
# Cost Curve
curve_w1, = plt.plot(w1_raw, cost_sim_mse[match(w2_sim, w2_raw[w2_nearest]), b_nearest], label='Cost Curve')
d1,d2,d3,d4 = plt.axis()
# plt.axis([d1,d2,-1,12])
# Tangent
intercept_w1 = cost - (prev_theta[1] * dcost_dtheta[1])
tangent_w1, = plt.plot(w1_raw, (dcost_dtheta[1]*w1_raw)+intercept_w1, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Weight 2 Subplot #####
plt.subplot(1,3,3)
plt.title('Weight 2')
plt.xlabel('Weight 2')
# Cost Curve
curve_w2, = plt.plot(w2_raw, cost_sim_mse[match(w1_sim, w1_raw[w1_nearest]), b_nearest], label='Cost Curve')
d1,d2,d3,d4 = plt.axis()
# plt.axis([d1,d2,-1,12])
# Tangent
intercept_w2 = cost - (prev_theta[2] * dcost_dtheta[2])
tangent_w2, = plt.plot(w2_raw, (dcost_dtheta[2]*w2_raw)+intercept_w2, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

update(1)
display(hbox)   
out
In [53]:
def update(iterations, nested=1):
    for _ in range(nested):
        cost, dcost_dtheta, data = one_iteration(X, y)
    ax.clear()
    ax.set_xlabel('x1', fontsize=15)
    ax.set_ylabel('x2', fontsize=15)
    ax.set_zlabel('Y', fontsize=15)
    ax.scatter(x1, x2, y, s=100, c='r')
    z = model(theta[0], theta[1], lx1, theta[2], lx2)
    line = ax.plot_surface(lx1, lx2, z, color='green')
#     line = ax.scatter(line_x_1, line_x_2, z, s=5, c=z)
    out.clear_output()
    out.append_stdout(data)
    fig.canvas.draw()
    
def update_ten(iterations):
    if visualize.value:
        for i in range(10):
            update(1)
    else:
        update(1, nested=10)

def update_hundred(iterations):
    if visualize.value:
        for i in range(100):
            update(1)
    else:
        update(1, nested=100)
        
def update_ten_thousand(iterations):
    if visualize.value:
        print('Not Allowed, Unselect Visualize to Proceed.')
    else:
        update(1, nested=10000)

##### Widgets #####
out = widgets.Output(layout={'border': '1px solid black'})
button_one = widgets.Button(description="1 Iteration")
button_one.on_click(update)
button_ten = widgets.Button(description="10 Iterations")
button_ten.on_click(update_ten)
button_hundred = widgets.Button(description="!!!100 Iterations!!!")
button_hundred.on_click(update_hundred)
button_ten_thousand = widgets.Button(description="!!!Warning - 10000!!!")
button_ten_thousand.on_click(update_ten_thousand)
visualize = widgets.Checkbox(value=False, description='Visualize Changes', disabled=False)
hbox = widgets.HBox([button_one, button_ten, button_hundred, button_ten_thousand, visualize])

# Initialize Random Weights & Bias
theta = np.random.uniform(-5,5,size=3)

line_x_1 = np.repeat(np.linspace(-5,5,size), size)
line_x_2 = np.tile(np.linspace(-5,5,size), size)

##################################
### Visualize Regression Plane ###
##################################
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')

plt.suptitle('Regression Plane', fontsize=15)
ax.set_xlabel('x1', fontsize=15)
ax.set_ylabel('x2', fontsize=15)
ax.set_zlabel('Y', fontsize=15)

ax.scatter(x1, x2, y, s=100, c='r')
z = model(theta[0], theta[1], lx1, theta[2], lx2)
line = ax.plot_surface(lx1, lx2, z, color='green')

update(1)
display(hbox)
out


Cross Entropy Loss

In [54]:
def one_iteration(x, y, learning_rate=1e-1, first=False):
    global theta, prev_theta
    prev_theta = theta
    
    X = np.vstack([np.ones(y.shape[0]), x])

    # Model
    pred = sigmoid(np.dot(theta, X))
    
    # Calculations for Backpropagation
    error = ((y * np.log(pred + epsilon)) + ((1-y) * np.log(1-pred + epsilon)))
    cost = - np.sum(error, len(error.shape)-1)/len(error)
    
    dcost_dtheta = np.array([sum(Xi * (pred-y))/len(error) for Xi in X])
    theta = theta - (dcost_dtheta * learning_rate)
    
    if len(x.shape) == 1:
        data = "Cost: {}\nBias: {}\nWeight: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight: {}\n".format(cost, theta[0], theta[1], dcost_dtheta[0], dcost_dtheta[1])
    elif len(x.shape) == 2:
        data = "Cost: {}\nBias: {}\nWeight 1: {}\nWeight 2: {}\nGradient of Cost Curve to Bias: {}\
            \nGradient of Cost Curve to Weight 1: {}\nGradient of Cost Curve to Weight 2: {}\n"\
            .format(cost, theta[0], theta[1], theta[2], dcost_dtheta[0], dcost_dtheta[1], dcost_dtheta[2])
    
    return cost, dcost_dtheta, data
In [55]:
def find_nearest(array, value):
    idx = (np.abs(array - value)).argmin()
    return idx

def match(array, value):
    cond = array == value
    return np.where(cond)[0]

def match_2(array1, array2, value1, value2):
    cond1 = array1 == value1
    cond2 = array2 == value2
    a = np.where(cond1)[0]
    b = np.where(cond2)[0]
    return a[np.where(np.in1d(a, b))][0]
    
def update(iterations, nested=0):
    # Run Iteration
    if nested:
        for i in range(nested):
            cost, dcost_dtheta, data = one_iteration(X, y)
    else:
        cost, dcost_dtheta, data = one_iteration(X, y)
    # Replot Cost Curves
    b_nearest = find_nearest(b_sim, prev_theta[0])
    w1_nearest = find_nearest(w1_raw, prev_theta[1])
    w2_nearest = find_nearest(w2_raw, prev_theta[2])
    curve_b.set_ydata(cost_sim_ce[match_2(w1_sim, w2_sim, w1_raw[w1_nearest], w2_raw[w2_nearest])])
    curve_w1.set_ydata(cost_sim_ce[match(w2_sim, w2_raw[w2_nearest]), b_nearest])
    curve_w2.set_ydata(cost_sim_ce[match(w1_sim, w1_raw[w1_nearest]), b_nearest])
    # Replot Tangents
    intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
    intercept_w1 = cost - (prev_theta[1] * dcost_dtheta[1])
    intercept_w2 = cost - (prev_theta[2] * dcost_dtheta[2])
    tangent_b.set_ydata((dcost_dtheta[0]*b_sim)+intercept_b)
    tangent_w1.set_ydata((dcost_dtheta[1]*w1_raw)+intercept_w1)
    tangent_w2.set_ydata((dcost_dtheta[2]*w2_raw)+intercept_w2)
    # Clear and Redraw
    out.clear_output()
    out.append_stdout(data)
    fig.canvas.draw()

def update_ten(iterations):
    if visualize.value:
        for i in range(10):
            update(1)
    else:
        update(1, nested=10)

def update_hundred(iterations):
    if visualize.value:
        for i in range(100):
            update(1)
    else:
        update(1, nested=100)
        
def update_thousand(iterations):
    if visualize.value:
        print('Not Allowed, Unselect Visualize to Proceed.')
    else:
        update(1, nested=1000)
        
##### Widgets #####
out = widgets.Output(layout={'border': '1px solid black'})
button_one = widgets.Button(description="1 Iteration")
button_one.on_click(update)
button_ten = widgets.Button(description="10 Iterations")
button_ten.on_click(update_ten)
button_hundred = widgets.Button(description="!!!100 Iterations!!!")
button_hundred.on_click(update_hundred)
button_thousand = widgets.Button(description="!!!Warning - 1000!!!")
button_thousand.on_click(update_thousand)
visualize = widgets.Checkbox(value=False, description='Visualize Changes', disabled=False)
hbox = widgets.HBox([button_one, button_ten, button_hundred, button_thousand, visualize])

# Initialize Random Weights & Bias
theta = np.random.uniform(-5,5,size=3)

# Run First Iteration
cost, dcost_dtheta, data = one_iteration(X, y)
                           
b_nearest = find_nearest(b_sim, prev_theta[0])
w1_nearest = find_nearest(w1_raw, prev_theta[1])
w2_nearest = find_nearest(w2_raw, prev_theta[2])

############################
### Visualize Cost Curve ###
############################
fig = plt.figure(figsize=(10.5,5))
ax = fig.add_subplot(111)
plt.suptitle('Cost Curves')

##### Bias Subplot #####
plt.subplot(1,3,1)
plt.title('Bias')
plt.xlabel('Bias')
plt.ylabel('Cost')
# Cost Curve
curve_b, = plt.plot(b_sim, cost_sim_ce[match_2(w1_sim, w2_sim, w1_raw[w1_nearest], w2_raw[w2_nearest])], label='Cost Curve')
d1,d2,d3,d4 = plt.axis()
plt.axis([-20,20,-5,15])
# Tangent
intercept_b = cost - (prev_theta[0] * dcost_dtheta[0])
tangent_b, = plt.plot(b_sim, (dcost_dtheta[0]*b_sim)+intercept_b, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Weight 1 Subplot #####
plt.subplot(1,3,2)
# plt.axis([-30.5,30.5,-1000,1000])
plt.title('Weight 1')
plt.xlabel('Weight 1')
# Cost Curve
curve_w1, = plt.plot(w1_raw, cost_sim_ce[match(w2_sim, w2_raw[w2_nearest]), b_nearest], label='Cost Curve')
d1,d2,d3,d4 = plt.axis()
plt.axis([-15,15,-5,15])
# Tangent
intercept_w1 = cost - (prev_theta[1] * dcost_dtheta[1])
tangent_w1, = plt.plot(w1_raw, (dcost_dtheta[1]*w1_raw)+intercept_w1, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

##### Weight 2 Subplot #####
plt.subplot(1,3,3)
plt.title('Weight 2')
plt.xlabel('Weight 2')
# Cost Curve
curve_w2, = plt.plot(w2_raw, cost_sim_ce[match(w1_sim, w1_raw[w1_nearest]), b_nearest], label='Cost Curve')
d1,d2,d3,d4 = plt.axis()
# plt.axis([d1,d2,-1,12])
# Tangent
intercept_w2 = cost - (prev_theta[2] * dcost_dtheta[2])
tangent_w2, = plt.plot(w2_raw, (dcost_dtheta[2]*w2_raw)+intercept_w2, label='Gradient (Tangent Line)', linestyle='--')
plt.legend();

update(1)
display(hbox)   
out


In [56]:
def update(iterations, nested=1):
    for _ in range(nested):
        cost, dcost_dtheta, data = one_iteration(X, y)
    ax.clear()
    ax.set_xlabel('x1', fontsize=15)
    ax.set_ylabel('x2', fontsize=15)
    ax.set_zlabel('Y', fontsize=15)
    ax.scatter(x1, x2, y, s=100, c='r')
    z = model(theta[0], theta[1], lx1, theta[2], lx2)
    line = ax.plot_surface(lx1, lx2, z, color='green')
    out.clear_output()
    out.append_stdout(data)
    fig.canvas.draw()
    
def update_ten(iterations):
    if visualize.value:
        for i in range(10):
            update(1)
    else:
        update(1, nested=10)

def update_hundred(iterations):
    if visualize.value:
        for i in range(100):
            update(1)
    else:
        update(1, nested=100)
        
def update_ten_thousand(iterations):
    if visualize.value:
        print('Not Allowed, Unselect Visualize to Proceed.')
    else:
        update(1, nested=10000)

##### Widgets #####
out = widgets.Output(layout={'border': '1px solid black'})
button_one = widgets.Button(description="1 Iteration")
button_one.on_click(update)
button_ten = widgets.Button(description="10 Iterations")
button_ten.on_click(update_ten)
button_hundred = widgets.Button(description="!!!100 Iterations!!!")
button_hundred.on_click(update_hundred)
button_ten_thousand = widgets.Button(description="!!!Warning - 10000!!!")
button_ten_thousand.on_click(update_ten_thousand)
visualize = widgets.Checkbox(value=False, description='Visualize Changes', disabled=False)
hbox = widgets.HBox([button_one, button_ten, button_hundred, button_ten_thousand, visualize])

# Initialize Random Weights & Bias
theta = np.random.uniform(-5,5,size=3)

line_x_1 = np.repeat(np.linspace(-5,5,size), size)
line_x_2 = np.tile(np.linspace(-5,5,size), size)

##################################
### Visualize Regression Plane ###
##################################
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')

plt.suptitle('Logistic Regression Plane', fontsize=15)
ax.set_xlabel('x1', fontsize=15)
ax.set_ylabel('x2', fontsize=15)
ax.set_zlabel('Y', fontsize=15)

ax.scatter(x1, x2, y, s=100, c='r')
z = model(theta[0], theta[1], lx1, theta[2], lx2)
line = ax.plot_surface(lx1, lx2, z, color='green')

update(1)
display(hbox)
out
C:\Users\cheon\Anaconda3\lib\site-packages\ipykernel_launcher.py:57: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).