I'm trying to code my own implementation of the Adam optimization algorithm, but when I try to find the optimum of the function $f(x, y) = x^2 + y^2$, the method generates unexpected output.
Here is the code, along with a graph showing each point on Adam's path and on the path of a simpler algorithm, SGD.
class optimizer:
    """Minimal optimizer base class: stores parameters and clears their gradients.

    Subclasses override ``step`` to apply an actual update rule.
    """

    def __init__(self, params):
        """Store the parameters to optimize.

        Args:
            params: Iterable of parameters (a generator is fine). It is
                materialized into a list so it can be traversed on every call.
        """
        self.parameters = list(params)

    def zero_grad(self):
        """Zero the gradient of every stored parameter in place.

        Parameters that have no gradient (``grad`` attribute missing or
        ``None``, e.g. before the first backward pass) are skipped. Any error
        raised while zeroing an existing gradient propagates to the caller
        instead of being silently swallowed.
        """
        for param in self.parameters:
            grad = getattr(param, "grad", None)
            if grad is not None:
                grad.zero_()

    def step(self):
        """Apply one update step; the base class does nothing."""
class Adam(optimizer):
    """Adam update rule with bias-corrected first and second moment estimates.

    For each parameter the optimizer keeps two exponential moving averages
    (EMAs): one of the gradient (``EMA1``) and one of the squared gradient
    (``EMA2``). Each step moves the parameter by
    ``lr * (EMA1 / correct1) / (sqrt(EMA2 / correct2) + eps)``.
    """

    def __init__(self, params, lr, beta1=0.9, beta2=0.999, eps=1e-9):
        """Set up hyperparameters and zero-initialized moment buffers.

        Args:
            params: Iterable of tensors to optimize.
            lr: Learning rate (step size multiplier).
            beta1: Decay rate of the gradient EMA.
            beta2: Decay rate of the squared-gradient EMA.
            eps: Constant added to the denominator to avoid division by zero.
        """
        super().__init__(params)
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        # One buffer per parameter, matching its shape, dtype and device.
        self.EMA1 = [torch.zeros_like(param) for param in self.parameters]
        self.EMA2 = [torch.zeros_like(param) for param in self.parameters]
        self.iter_num = 0

    def step(self):
        """Perform one Adam update on every parameter that has a gradient.

        Parameters whose ``grad`` is ``None`` are skipped and their moment
        buffers are left untouched. The step counter is shared by all
        parameters, so the bias correction is computed from the global number
        of ``step`` calls.
        """
        self.iter_num += 1

        # The EMAs start at zero and are therefore biased toward zero during
        # the first iterations; dividing by these factors compensates for it.
        correct1 = 1 - self.beta1 ** self.iter_num
        correct2 = 1 - self.beta2 ** self.iter_num

        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for param, ema1, ema2 in zip(self.parameters, self.EMA1, self.EMA2):
                grad = param.grad
                if grad is None:
                    continue

                # In-place updates keep the buffers stored in self.EMA1 /
                # self.EMA2 current without allocating replacements.
                ema1.mul_(self.beta1).add_(grad, alpha=1 - self.beta1)
                ema2.mul_(self.beta2).addcmul_(grad, grad, value=1 - self.beta2)

                numerator = ema1 / correct1
                denominator = (ema2 / correct2).sqrt() + self.eps

                param -= self.lr * numerator / denominator

