I'm working on a multivariate optimization problem using a simultaneous-perturbation (SPSA-style) gradient descent algorithm. The optimizer does an okay job, but the cost function does not follow the desired monotonic descending trend: it blows up right after reaching the minimal value.
My suspicion is that the step size is too big. My dilemma is that the algorithm is slow if the step size is too small, but it steps outside the solution space if it's too big.
Can someone provide some advice on what would be a preferred approach? Any suggestion is greatly appreciated.
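For context, the decaying gain schedule in the a_k / c_k lines I have commented out in minimise below is what I understand to be the usual way around this trade-off: large steps early, shrinking steps near the minimum. A minimal sketch of that schedule on its own (the constants a, c, A and the exponents here are hypothetical placeholders, not values from my class):

import numpy as np

# Decaying SPSA-style gain sequences: big early steps for fast progress,
# small late steps so the iterate stops overshooting the minimum.
a, c = 0.1, 0.1              # initial gains (problem-dependent)
A = 50                       # stability constant, often ~10% of max_iter
alpha, gamma = 0.602, 0.101  # common decay exponents from the SPSA literature

def gains(k):
    a_k = a / (k + 1 + A) ** alpha  # step size at iteration k
    c_k = c / (k + 1) ** gamma      # perturbation size at iteration k
    return a_k, c_k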
Here is my implementation of the optimizer part (zernike_plane and image_entropy are helpers defined elsewhere in my code):
import numpy as np


class spdg:
    def __init__(self, loss_function,
                 # a, c,  # gains for a decaying schedule (currently unused)
                 alpha_val, gamma_val, max_iter, img_target, zernike,
                 momentum=0.2, cal_tolerance=1e-6):
        # Initialize gain parameters and decay factors
        # self.a = a
        # self.c = c
        self.alpha_val = alpha_val  # step size of the parameter update
        self.gamma_val = gamma_val  # size of the random perturbation
        self.loss = loss_function
        self.max_iter = max_iter
        self.A = max_iter / 10  # stability constant for a decaying-gain schedule
        self.img_target = img_target
        self.zernike = zernike
        self.initial_entropy = image_entropy(self.img_target)  # helper defined elsewhere
        self.momentum = momentum
        self.cal_tolerance = cal_tolerance
    def calc_loss(self, current_Aw):
        """Evaluate the cost/loss function at the given coefficients."""
        Az = self.update_AZ(current_Aw)
        return self.loss(Az, self.img_target)

    def update_AZ(self, current_Aw):
        # Build the wavefront plane from the current Zernike coefficients
        # (zernike_plane is a helper defined elsewhere).
        return zernike_plane(current_Aw, self.zernike)
    def minimise(self, current_Aw, optimizer_type='vanilla', verbose=False):
        k = 0  # iteration count
        cost_func_val = []
        Aw_values = []
        vk = 0  # momentum accumulator
        # Start the convergence test from an "infinitely far" previous iterate
        # so it cannot fire spuriously on the first pass.
        previous_Aw = np.full_like(current_Aw, np.inf, dtype=float)
        while (k < self.max_iter and
               np.linalg.norm(previous_Aw - current_Aw) > self.cal_tolerance):
            previous_Aw = current_Aw
            cost_val = self.calc_loss(current_Aw)
            # Gain sequences for a decaying schedule (currently unused):
            # a_k = self.a / (k + 1 + self.A) ** self.alpha_val
            # c_k = self.c / (k + 1) ** self.gamma_val
            if verbose:
                print('iteration %d: %s with cost function value %.2f'
                      % (k, str(current_Aw), cost_val))
            # Random perturbation vector: Bernoulli +/-1 with p=0.5, scaled by gamma_val
            delta = (np.random.randint(0, 2, current_Aw.shape) * 2 - 1) * self.gamma_val
            Aw_plus = current_Aw + delta
            Aw_minus = current_Aw - delta
            # Measure the loss function at the two perturbed points
            loss_plus = self.calc_loss(Aw_plus)
            loss_minus = self.calc_loss(Aw_minus)
            loss_delta = loss_plus - loss_minus
            # Simultaneous-perturbation estimate of the gradient; the 1/(2*c_k)
            # normalisation of textbook SPSA is absorbed into alpha_val here.
            g_hat = loss_delta * delta
            # Update the estimate of the parameter
            if optimizer_type == 'vanilla':
                current_Aw = current_Aw - self.alpha_val * g_hat
            elif optimizer_type == 'momentum':
                vk_next = self.alpha_val * g_hat + self.momentum * vk
                current_Aw = current_Aw - vk_next
                vk = vk_next
            else:
                raise ValueError('unknown optimizer_type: %s' % optimizer_type)
            cost_val = self.calc_loss(current_Aw)
            cost_func_val.append(np.squeeze(cost_val))
            Aw_values.append(current_Aw)
            k += 1
        # Return the iterate with the lowest recorded cost, not the last one
        sol_idx = np.argmin(cost_func_val)
        Aw_estimate = Aw_values[sol_idx]
        print('optimal solution found at iteration %d' % sol_idx)
        return Aw_estimate, cost_func_val
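For completeness, this is roughly how I call it. The helpers below (zernike_plane, image_entropy, the loss, and all the shapes and values) are toy stand-ins just to make the snippet self-contained, not my real code:

import numpy as np

def zernike_plane(Aw, zernike):
    # toy stand-in: weighted sum of basis planes
    return np.tensordot(Aw, zernike, axes=1)

def image_entropy(img):
    # toy stand-in: Shannon entropy of the normalised absolute image
    p = np.abs(img).ravel()
    p = p / p.sum()
    return -np.sum(p * np.log(p + 1e-12))

def mse_loss(img, target):
    # toy stand-in: mean squared error
    return np.mean((img - target) ** 2)

rng = np.random.default_rng(0)
zernike = rng.normal(size=(5, 32, 32))  # 5 basis planes of 32x32 pixels
true_Aw = rng.normal(size=5)
img_target = zernike_plane(true_Aw, zernike)

opt = spdg(mse_loss, alpha_val=0.01, gamma_val=0.1, max_iter=500,
           img_target=img_target, zernike=zernike)
Aw_estimate, history = opt.minimise(rng.normal(size=5),
                                    optimizer_type='momentum', verbose=False)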