I'm trying to write my own implementation of the Adam optimization algorithm, but when I try to find the minimum of the function f(x, y) = x² + y², the method produces unexpected output.
Below is the code, together with a graph of each point on Adam's path and, for comparison, the path of a simpler algorithm, SGD.
import torch


class optimizer:
    def __init__(self, params):
        self.parameters = list(params)  # materialize params in case it is a generator

    def zero_grad(self):
        for param in self.parameters:
            try:
                param.grad.zero_()
            except AttributeError:  # grad is None before the first backward()
                pass

    def step(self):
        pass


class Adam(optimizer):
    def __init__(self, params, lr, beta1=0.9, beta2=0.999):
        self.parameters = list(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.EMA1 = [torch.zeros_like(param) for param in self.parameters]  # first-moment estimates
        self.EMA2 = [torch.zeros_like(param) for param in self.parameters]  # second-moment estimates
        self.iter_num = 0
        self.eps = 1e-9

    def step(self):
        self.iter_num += 1
        correct1 = 1 - self.beta1 ** self.iter_num  # EMA1 bias correction
        correct2 = 1 - self.beta2 ** self.iter_num  # EMA2 bias correction
        with torch.no_grad():
            for param, EMA1, EMA2 in zip(self.parameters, self.EMA1, self.EMA2):
                EMA1.set_((1 - self.beta1) * param.grad + self.beta1 * EMA1)
                EMA2.set_((1 - self.beta2) * (param.grad ** 2) + self.beta2 * EMA2)
                numerator = EMA1 / correct1
                denominator = (EMA2 / correct2).sqrt() + self.eps
                param -= self.lr * numerator / denominator
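
For reference, here is a minimal sketch of the kind of loop that produces the path in the graph (the starting point, learning rate, and number of steps are illustrative placeholders, not the exact values from my run):

import torch

# Illustrative driver only: starting point, lr and step count are placeholders.
xy = torch.tensor([3.0, 1.0], requires_grad=True)
opt = Adam([xy], lr=0.1)

path = [xy.detach().clone()]
for _ in range(100):
    opt.zero_grad()
    loss = (xy ** 2).sum()  # f(x, y) = x^2 + y^2
    loss.backward()
    opt.step()
    path.append(xy.detach().clone())
# `path` holds each point Adam visits; the same loop with an SGD optimizer
# gives the second curve on the graph.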