I am working on a simple q learning code in python. After running several iterations the program suggest a valid path, but not always the shortest -which is the point of the program. I am not sure what I am overlooking. I am using a jupyter notebook.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
iterations = 200
goalState = 5
gamma = 0.8
qValues = []
#actions
R = np.array([[-1, 0,-1,-1, 0,-1],
[ 0,-1,-1,-1,-1,100],
[-1,-1,-1,-1,-1,100],
[-1,-1,-1,-1, 0,100],
[ 0,-1,-1, 0,-1,-1],
[-1,-1,-1,-1,-1,100]])
#inital Q matrix
Q = np.zeros(R.shape)
for i in range(iterations):
state = np.random.randint(goalState + 1)
while state != goalState:
possibleActions = np.where(R[state] >= 0)[0]
action = possibleActions[np.random.randint(len(possibleActions+1))]
nextPossibleActions = np.where(R[action] >= 0)[0]
for k in nextPossibleActions:
qValues.append(Q[action][k])
qMax = max(qValues)
Q[state][action] += R[state][action] + gamma * qMax
state = action
Q = Q/Q.max() #normalising the matrix to percentage values
sns.set()
f, ax = plt.subplots(figsize=(8, 6))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(Q, cmap = cmap, annot=True, linewidths=.5, ax=ax)