I am comparing the Adam algorithm to SGD with momentum. I realised that the convergence rate of Adam is much worse than that of SGD with momentum when applied to the Rosenbrock function. This finding is in contrast to this visualisation. You can read the underlying code here.
To ensure that I did not have an implementation error, I compared the results of my algorithm to the PyTorch implementation. PyTorch and my implementation return the same result.
Therefore either PyTorch and my implementation are both incorrect, or the implementation in the link is incorrect. If you check out the code from the link above, you will find that the bias-correction step is missing. After adapting my code in the same way, the results did not significantly improve.
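Concretely, dropping the bias correction (to match the linked gist, as I understand it) just means using the raw moment estimates in the update. This is a minimal sketch of the variant I tested, reusing grad_f, Adam_para and the numpy import from the code further down; t is kept only so the signature stays a drop-in replacement for adam_inner:

def adam_inner_no_bias_corr(p, t, exp_avg, exp_avg_sqr, lr):
    # same as adam_inner below, but without the bias-correction step
    beta1, beta2, eps = Adam_para.beta1, Adam_para.beta2, Adam_para.eps
    g = grad_f(*p)
    exp_avg = beta1 * exp_avg + (1 - beta1) * g
    exp_avg_sqr = beta2 * exp_avg_sqr + (1 - beta2) * np.square(g)
    # raw moment estimates are used directly, no division by (1 - beta ** t)
    p = p - lr * exp_avg / (np.sqrt(exp_avg_sqr) + eps)
    return {'p': p, 'first_mom': exp_avg, 'second_mom': exp_avg_sqr}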
So my question is: why does it work in the linked scenario but not in my/PyTorch implementation, even though all three should return the same result?
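For reference, the SGD-with-momentum baseline I compare against looks roughly like this (a minimal sketch reusing grad_f and the numpy import from the code below; the momentum coefficient and learning rate are illustrative, not tuned values):

def sgd_momentum(p, it, lr=2e-3, momentum=0.9):
    # heavy-ball update: v <- momentum * v + g, p <- p - lr * v
    v = np.zeros_like(p)
    p_list = [p]
    for _ in range(it):
        g = grad_f(*p_list[-1])
        v = momentum * v + g
        p_list.append(p_list[-1] - lr * v)
    return np.asarray(p_list)

My Adam implementation and the PyTorch cross-check: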
import numpy as np
import torch
# Rosenbrock function
class Rosenbrock:
    a_f = 1.
    b_f = 2.
    # The minimum is at (a_f, a_f**2)

class Adam_para:
    beta1 = 0.9  # 0.7 # modified because of github: https://gist.github.com/EmilienDupont/f97a3902f4f3a98f350500a3a00371db
    beta2 = 0.999
    eps = 1e-8
    lr = 2e-2
    iterations = 100
def f(x, y):
    return (Rosenbrock.a_f - x) ** 2 + Rosenbrock.b_f * (y - x ** 2) ** 2

def grad_f(x, y):
    grad_x = -1. * 2 * (Rosenbrock.a_f - x) + Rosenbrock.b_f * (-2 * x) * 2 * (y - x ** 2)
    grad_y = Rosenbrock.b_f * 1. * 2 * (y - x ** 2)
    return np.array([grad_x, grad_y])
def adam_inner(p: np.ndarray, t, exp_avg, exp_avg_sqr, lr):
    # inner loop of the Adam algorithm
    # p: current point
    # t: iteration counter (0-based, incremented below)
    # exp_avg: first moment estimate
    # exp_avg_sqr: second moment estimate
    # lr: learning rate
    # the following values are taken from the Adam paper
    beta1 = Adam_para.beta1
    beta2 = Adam_para.beta2
    eps = Adam_para.eps
    t = t + 1
    g = grad_f(*p)
    exp_avg = beta1 * exp_avg + (1 - beta1) * g
    exp_avg_sqr = beta2 * exp_avg_sqr + (1 - beta2) * np.square(g)
    bias_corr_1 = 1 - beta1 ** t
    bias_corr_2 = 1 - beta2 ** t
    exp_avg_hat = exp_avg / bias_corr_1
    exp_avg_sqr_hat = exp_avg_sqr / bias_corr_2
    denom = np.sqrt(exp_avg_sqr_hat) + eps
    p = p - lr * exp_avg_hat / denom
    return {'p': p, 'first_mom': exp_avg, 'second_mom': exp_avg_sqr}
def adam(p, it, lr=0.001):
    # p: starting point
    # it: number of iterations
    # m: first moment estimate
    # v: second moment estimate
    # init
    m = 0
    v = 0
    p_list = [p]
    for i in range(it):
        tmp = adam_inner(p_list[-1], i, m, v, lr)
        p_list.append(tmp['p'])
        m = tmp['first_mom']
        v = tmp['second_mom']
    return np.asarray(p_list)

x0 = np.array([3.,3.])
t = adam(x0,Adam_para.iterations,Adam_para.lr)
x0_torch = torch.tensor(x0, requires_grad=True)
f_torch = f(x0_torch[0],x0_torch[1])
optimizer = torch.optim.Adam([x0_torch], lr = Adam_para.lr, betas=(Adam_para.beta1,Adam_para.beta2))
for i in range(Adam_para.iterations):
    optimizer.zero_grad()
    f_torch = f(x0_torch[0], x0_torch[1])
    f_torch.backward()
    optimizer.step()
print("pytorch result:", x0_torch)
print("my result:", t[-1])