diff --git a/fizz_buzz_pytorch.py b/fizz_buzz_pytorch.py
new file mode 100644
index 0000000..06e6a49
--- /dev/null
+++ b/fizz_buzz_pytorch.py
@@ -0,0 +1,63 @@
+import numpy as np
+
+import torch
+
+NUM_DIGITS = 10
+NUM_HIDDEN = 100
+BATCH_SIZE = 128
+
+# Represent each input by an array of its binary digits.
+def binary_encode(i, num_digits):
+    return np.array([i >> d & 1 for d in range(num_digits)])
+
+# Encode the desired outputs as class indices: [number, "fizz", "buzz", "fizzbuzz"]
+def fizz_buzz_encode(i):
+    if i % 15 == 0: return 3
+    elif i % 5 == 0: return 2
+    elif i % 3 == 0: return 1
+    else: return 0
+
+def fizz_buzz_decode(i, prediction):
+    return [str(i), "fizz", "buzz", "fizzbuzz"][prediction]
+
+# Train on numbers above 100 so the test range 1-100 stays unseen.
+trX = torch.Tensor([binary_encode(i, NUM_DIGITS) for i in range(101, 2 ** NUM_DIGITS)])
+trY = torch.LongTensor([fizz_buzz_encode(i) for i in range(101, 2 ** NUM_DIGITS)])
+
+
+# Define the model
+model = torch.nn.Sequential(
+    torch.nn.Linear(NUM_DIGITS, NUM_HIDDEN),
+    torch.nn.ReLU(),
+    torch.nn.Linear(NUM_HIDDEN, 4)
+)
+loss_fn = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
+
+
+# Train the model
+for epoch in range(10000):
+    for start in range(0, len(trX), BATCH_SIZE):
+        end = start + BATCH_SIZE
+        batchX = trX[start:end]
+        batchY = trY[start:end]
+
+        y_pred = model(batchX)
+        loss = loss_fn(y_pred, batchY)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+    # Report loss on the full training set
+    loss = loss_fn(model(trX), trY).item()
+    print('Epoch:', epoch, 'Loss:', loss)
+
+
+# Predict fizz buzz for the numbers 1-100
+testX = torch.Tensor([binary_encode(i, NUM_DIGITS) for i in range(1, 101)])
+with torch.no_grad():
+    testY = model(testX)
+predictions = zip(range(1, 101), testY.max(1)[1].tolist())
+
+print([fizz_buzz_decode(i, x) for (i, x) in predictions])
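
Reviewer note, outside the patch itself: the script only prints decoded strings, so a quick way to quantify how well the net generalizes is to compare its argmax predictions against ground-truth labels from fizz_buzz_encode. A minimal sketch, assuming the script above has just run in the same interpreter session (so fizz_buzz_encode and testY are still in scope); the resulting accuracy depends on training and is not guaranteed to be perfect.

    # Hypothetical sanity check; assumes the script above has already run,
    # so fizz_buzz_encode and testY are in scope.
    import numpy as np

    true_labels = np.array([fizz_buzz_encode(i) for i in range(1, 101)])
    pred_labels = testY.max(1)[1].numpy()  # argmax class index per number
    print('Test accuracy on 1-100:', (pred_labels == true_labels).mean())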