We have to write a simple 3-layer NN that learn f(x)=x² with the softplus activation function at the end.

In my implementation the reults are just rubbish and I don't know what I'm doing wrong.

```
import autograd.numpy as np
from autograd import grad
from autograd import elementwise_grad
from autograd import hessian
import random
class Neural_Net(object):
def __init__(self, inputSize, hiddenSize, outputSize,
learning_rate=0.0001, epochs=100,
activation1="sigmoid", activation2="softplus"):
self.inputSize = inputSize
self.outputSize = outputSize
self.hiddenSize = hiddenSize
self.learning_rate = learning_rate
self.epochs = epochs
if activation1 == 'softplus':
self.activation1 = softplus
self.activation1_grad = softplus_grad
if activation1 == 'sigmoid':
self.activation1 = sigmoid
self.activation1_grad = sigmoid_grad
if activation1 == 'tanh':
self.activation1 = np.tanh
self.activation1_grad = tanh_grad
if activation2 == 'softplus':
self.activation2 = softplus
self.activation2_grad = softplus_grad
if activation2 == 'sigmoid':
self.activation2 = sigmoid
self.activation2_grad = sigmoid_grad
if activation2 == 'tanh':
self.activation2 = np.tanh
self.activation2_grad = tanh_grad
self.W1 = np.random.randn(self.inputSize, self.hiddenSize)
self.b1 = np.ones((1, self.hiddenSize))
self.W2 = np.random.randn(self.hiddenSize, self.outputSize)
self.b2 = np.ones((1, self.outputSize))
def forward_prop(self, X):
self.Z1 = np.dot(X, self.W1) + self.b1
self.A1 = self.activation1(self.Z1)
self.Z2 = np.dot(self.A1, self.W2) + self.b2
self.A2 = self.activation2(self.Z2)
return self.A2
def back_prop(self, X, Y):
self.dA2 = (self.A2 - Y)*self.activation2_grad(self.Z2)
self.dA1 = (np.dot(self.dA2,self.W2.T))*self.activation1_grad(self.Z1)
self.W1 -= self.learning_rate*X.T.dot(self.dA1)
self.b1 -= self.learning_rate*self.dA1
self.W2 -= self.learning_rate*np.dot(self.A1.T, self.dA2)
self.b2 -= self.learning_rate*self.dA2
def train(self, X, Y):
self.forward_prop(X)
self.back_prop(X, Y)
def softplus(x):
return np.log(1 + np.exp(x))
def sigmoid(x):
return 1/(1+np.exp(-x))
softplus_grad = elementwise_grad(softplus)
sigmoid_grad = elementwise_grad(sigmoid)
tanh_grad = elementwise_grad(np.tanh)
NN1 = Neural_Net(inputSize=1, hiddenSize=1, outputSize=1, epochs=10000)
for epoch in range(NN1.epochs):
X = np.array(([[random.randint(1, 100)]]))
Y = np.square(X)
A2 = NN1.forward_prop(X)
print("Input: " + str(X))
print("Actual Output: " + str(Y))
print("Predicted Output: " + str(A2))
print("Loss: " + str(np.mean(np.square(Y - A2))))
print("\n")
NN1.train(X, Y)
```

The expected output just increases and depending on which parameters I choose it becomes NaN or inf before it finishes

Just like at forward step you calculate Z1 then A1 then Z2 then A2, at backward step you should calculate gradients at opposite order: dA2 then dZ2 then dA1 then dZ1. You don't calculate dZ2 or dZ1, therefore it cannot work. Maybe you have other problems as well, but this one is the most obvious.

To check that the gradients are correct, calculate them directly (for each weight or bias, increase it by a small value epsilon, see how much the error changed, divide by epsilon). Such a direct calculation should be close to weight gradients. You don't calculate them explicitly, but should for test purposes.