59 changes: 25 additions & 34 deletions backprop/classification.ipynb

Large diffs are not rendered by default.

162 changes: 162 additions & 0 deletions backprop/classification.py
@@ -0,0 +1,162 @@
# ---
# jupyter:
# jupytext:
# formats: ipynb,py:percent
# text_representation:
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.16.1
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
# name: python3
# ---

# %%
import matplotlib.pyplot as plt

# %%
import numpy as np
import sklearn.datasets

np.random.seed(0)

# %%
# load a classification dataset from sklearn
x, y = sklearn.datasets.make_moons(200, noise=0.20)
plt.scatter(x[:, 0], x[:, 1], s=40, c=y, cmap=plt.cm.Spectral)
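
# %%
# quick look at the data: x has 200 rows of two features, y holds the binary class labels
x.shape, y.shape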

# %%
# network architecture
i_size = x.shape[1]
h_size = 30000
o_size = 2

# hyperparameters
# learning rate
lr = 0.01
# L2 regularization strength
reg = 1.0

# %%
# sample weights from a normal distribution, scaled by 1 / sqrt(fan-in)
# so the wide hidden layer does not immediately saturate the softmax
w0 = np.random.randn(i_size, h_size) / np.sqrt(i_size)
b0 = np.zeros((1, h_size))

w1 = np.random.randn(h_size, o_size) / np.sqrt(h_size)
b1 = np.zeros((1, o_size))

w = [w0, w1]
b = [b0, b1]

# %%
# work with a sample of data
n_samples = 4
sample_x, sample_y = x[:n_samples], y[:n_samples]


# %%
def forward(x, w, b):
    # input -> hidden
    z0 = x.dot(w[0]) + b[0]
    a0 = np.tanh(z0)

    # hidden -> output
    z1 = a0.dot(w[1]) + b[1]

    # softmax, shifted by the row max for numerical stability
    exp_z1 = np.exp(z1 - np.max(z1, axis=1, keepdims=True))
    probs = exp_z1 / np.sum(exp_z1, axis=1, keepdims=True)
    preds = np.argmax(probs, axis=1)

    return z0, a0, z1, probs, preds


z0, a0, z1, probs, preds = forward(sample_x, w, b)

# %%
probs
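
# %%
# each row of probs is a distribution over the two classes, so it should sum to 1
probs.sum(axis=1)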

# %%
preds

# %%
sample_y


# %%
def log_likelihood_loss(probs, y):
    # select the probabilities assigned to the correct classes
    errors = probs[range(y.shape[0]), y]
    log_probs = -np.log(errors)
    loss = np.mean(log_probs)
    return errors, log_probs, loss

errors, log_probs, loss = log_likelihood_loss(probs, sample_y)
loss
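
# %%
# sanity check: the same loss computed inline as the mean negative
# log-probability of the correct class
np.mean(-np.log(probs[np.arange(n_samples), sample_y]))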


# %%
def backward(lr, reg, w, b, z0, a0, z1, probs, preds, y, x):
    # gradient of softmax + cross-entropy w.r.t. the output logits: probs - one_hot(y)
    d2 = probs
    d2[range(y.shape[0]), y] -= 1
    dw1 = (a0.T).dot(d2)
    db1 = d2

    # backpropagate through tanh: d/dz tanh(z) = 1 - tanh(z)^2
    d1 = d2.dot(w[1].T) * (1 - np.power(a0, 2))
    dw0 = np.dot(x.T, d1)
    db0 = d1

    # add the gradient of the L2 regularization term
    dw1 += reg * w[1]
    dw0 += reg * w[0]

    # gradient descent step
    w[0] += -lr * dw0
    b[0] += -lr * np.sum(db0, 0, keepdims=True)
    w[1] += -lr * dw1
    b[1] += -lr * np.sum(db1, 0, keepdims=True)
    return w, b
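
# %%
# quick sanity check of the tanh derivative used in backward():
# d/dz tanh(z) = 1 - tanh(z)^2, compared against a finite-difference estimate
z = np.linspace(-2.0, 2.0, 5)
eps = 1e-6
numeric = (np.tanh(z + eps) - np.tanh(z - eps)) / (2 * eps)
analytic = 1 - np.tanh(z) ** 2
np.allclose(numeric, analytic)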


# %%
hist = []
for _ in range(100):
    z0, a0, z1, probs, preds = forward(x, w, b)
    # the forward pass runs on the full dataset, so score the loss against the full y
    errors, log_probs, loss = log_likelihood_loss(probs, y)
    hist.append(loss)
    w, b = backward(lr, reg, w, b, z0, a0, z1, probs, preds, y, x)
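
# %%
# plot the loss history collected during training
plt.figure()
plt.plot(hist)
plt.xlabel("iteration")
plt.ylabel("mean cross-entropy loss")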

# %%
z0, a0, z1, probs, preds = forward(x, w, b)


# %%
def plot_decision_boundary(pred_func):
    # Set min and max values and give them some padding
    x_min, x_max = x[:, 0].min() - 0.5, x[:, 0].max() + 0.5
    y_min, y_max = x[:, 1].min() - 0.5, x[:, 1].max() + 0.5
    h = 0.01
    # Generate a grid of points with distance h between them
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict the class for every point on the grid
    Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Plot the contour and the training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(x[:, 0], x[:, 1], c=y, cmap=plt.cm.Spectral)


# %%
# x_min/x_max and y_min/y_max span the two feature axes of the plot - not a feature and the target
x_min, x_max = x[:, 0].min() - 0.5, x[:, 0].max() + 0.5
y_min, y_max = x[:, 1].min() - 0.5, x[:, 1].max() + 0.5

h = 0.01

xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

z0, a0, z1, probs, preds = forward(np.c_[xx.ravel(), yy.ravel()], w, b)
preds = preds.reshape(xx.shape)
plt.contourf(xx, yy, preds, cmap=plt.cm.Spectral, alpha=0.2)
plt.scatter(x[:, 0], x[:, 1], c=y, cmap=plt.cm.Spectral)
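
# %%
# training accuracy of the trained network
z0, a0, z1, probs, preds = forward(x, w, b)
(preds == y).mean()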

# %%
13 changes: 11 additions & 2 deletions backprop/intro-to-backprop.ipynb
@@ -179,7 +179,7 @@
"\n",
"Derivative of the error:\n",
"\n",
"$$E' = \\frac{dE}{d\\theta} = 2(\\theta x^2 - y) \\cdot x^2 $$\n",
"$$E' = \\frac{dE}{d\\theta} = (\\theta x^2 - y) \\cdot x^2 $$\n",
"\n",
"We can now perform an iterative process to update our parameter $\\theta$, starting from an initial $\\theta = 0 $:\n",
"\n",
@@ -195,7 +195,9 @@
"\n",
"As we are minimizing the error, we take the negative of the gradient and use it to update our parameter:\n",
"\n",
"$$\\theta_{1} = \\theta_{0} + E' = 0 + 4 = 4.0 $$\n",
"learning rate\n",
"alpha = 0.1\n",
"$$\\theta_{1} = \\theta_{0} + E' = 0 + 0.1 * 4 = 0.4 $$\n",
"\n",
"Which is not so far away from the true value of $3$.\n",
"\n",
@@ -336,6 +338,13 @@
"\n",
"You now have all the tools to derive update equations for all our weights and biases - do so on paper."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {