Using Adam to find the minimum of the Rosenbrock function using Pytorch

605 views Asked by At

I am comparing the Adam algorithm to SGD with momentum. I realised that the convergence rate of Adam is way worse than the convergence rate of SGD with momentum when applied to the Rosenbrock function. This finding is in contrast to this visualisation. You can read the underlying code here.

To ensure that I did not have an implementation error, I compared the results of my algorithm to the PyTorch implementation. PyTorch and my implementation return the same result.

Therefore, either both PyTorch and my implementation are incorrect, or the implementation in the link is incorrect. If you check out the code from the link above, you will find that the bias-correction step is missing. After adapting my code in the same way, the results did not significantly improve.

So my question is: why does it work in the linked scenario but not in my/PyTorch implementation, even though all three should return the same result?

import numpy as np
import torch

# Rosenbrock function
class Rosenbrock:
    # Coefficients of the test function evaluated by f(x, y):
    #   (a_f - x)**2 + b_f * (y - x**2)**2
    # Both terms vanish at (a_f, a_f**2), which is therefore the minimum.
    a_f, b_f = 1., 2.
class Adam_para:
    # Hyperparameters shared by the hand-written Adam and the PyTorch run.

    # Run configuration: step size and number of optimisation steps.
    lr = 2e-2
    iterations = 100

    # Decay rates of the first / second moment estimates and the small
    # constant added to the denominator of the update.
    beta1 = 0.9 # 0.7 # modified because of github: https://gist.github.com/EmilienDupont/f97a3902f4f3a98f350500a3a00371db
    beta2 = 0.999
    eps = 1e-8

def f(x,y):
    """Evaluate the Rosenbrock-type objective at the point (x, y).

    The value is (a_f - x)**2 + b_f * (y - x**2)**2 with the coefficients
    taken from the ``Rosenbrock`` class. Only arithmetic operators are used,
    so any operands supporting ``-``, ``*``, ``+`` and ``**`` work.
    """
    offset_term = (Rosenbrock.a_f - x) ** 2
    valley_term = (y - x ** 2) ** 2
    return offset_term + Rosenbrock.b_f * valley_term

def grad_f(x,y):
    """Return the analytic gradient of ``f`` at (x, y) as a NumPy array.

    The result is ``np.array([df/dx, df/dy])`` for
    f(x, y) = (a_f - x)**2 + b_f * (y - x**2)**2.
    """
    a = Rosenbrock.a_f
    b = Rosenbrock.b_f
    # Inner residual of the second term; it appears in both partial derivatives.
    residual = y - x ** 2
    # d/dx: derivative of (a - x)**2 plus the chain rule through -x**2.
    d_dx = -1. * 2 * (a - x) + b * (-2 * x) * 2 * residual
    # d/dy: only the second term depends on y.
    d_dy = b * 1. * 2 * residual
    return np.array([d_dx, d_dy])

def adam_inner(p: np.ndarray,t,exp_avg,exp_avg_sqr, lr):
    """Perform a single Adam update and return the new state.

    Parameters
    ----------
    p : current point; unpacked as the arguments of ``grad_f``.
    t : step counter before this update; it is incremented by one before
        being used in the bias-correction terms.
    exp_avg : running first-moment estimate from the previous step.
    exp_avg_sqr : running second-moment estimate from the previous step.
    lr : learning rate.

    Returns
    -------
    dict with keys ``'p'`` (updated point), ``'first_mom'`` and
    ``'second_mom'`` (updated, not bias-corrected, moment estimates).

    beta1, beta2 and eps are read from ``Adam_para``.
    """
    b1, b2 = Adam_para.beta1, Adam_para.beta2
    step = t + 1
    gradient = grad_f(*p)

    # Exponential moving averages of the gradient and its element-wise square.
    new_first = b1 * exp_avg + (1 - b1) * gradient
    new_second = b2 * exp_avg_sqr + (1 - b2) * np.square(gradient)

    # Bias-corrected estimates; only these enter the update, while the
    # uncorrected averages are what gets carried to the next step.
    first_hat = new_first / (1 - b1 ** step)
    second_hat = new_second / (1 - b2 ** step)

    update = lr * first_hat / (np.sqrt(second_hat) + Adam_para.eps)
    return {'p': p - update, 'first_mom': new_first, 'second_mom': new_second}

def adam(p, it, lr=0.001):
    """Run ``it`` Adam steps starting from ``p`` and return every iterate.

    Parameters
    ----------
    p : starting point, passed to ``adam_inner`` as the first iterate.
    it : number of iterations.
    lr : learning rate forwarded to ``adam_inner``.

    Returns
    -------
    ``np.asarray`` of the list ``[p, p_1, ..., p_it]``; the first entry is
    the starting point and the last entry is the final iterate.
    """
    # Both moment estimates start at zero.
    first_moment = 0
    second_moment = 0
    current = p
    trajectory = [current]
    for step in range(it):
        state = adam_inner(current, step, first_moment, second_moment, lr)
        current = state['p']
        first_moment = state['first_mom']
        second_moment = state['second_mom']
        trajectory.append(current)
    return np.asarray(trajectory)
    
# Minimise f from the same starting point with the hand-written Adam and
# with torch.optim.Adam, using identical hyperparameters, then print both
# final iterates for comparison.
x0 = np.array([3.,3.])
t = adam(x0,Adam_para.iterations,Adam_para.lr)

x0_torch = torch.tensor(x0, requires_grad=True)
# eps is passed explicitly so both implementations keep using the same value
# if Adam_para.eps is ever changed (it currently equals the PyTorch default).
optimizer = torch.optim.Adam(
    [x0_torch],
    lr=Adam_para.lr,
    betas=(Adam_para.beta1, Adam_para.beta2),
    eps=Adam_para.eps,
)
for _ in range(Adam_para.iterations):
    optimizer.zero_grad()
    # The objective is rebuilt every step so backward() sees the current point.
    f_torch = f(x0_torch[0], x0_torch[1])
    f_torch.backward()
    optimizer.step()

print("pytorch result:", x0_torch)
print("my result:", t[-1])
0

There are 0 answers