I want to implement a Markov Decision Process for the robot example. I used NumPy arrays to store the transition probability matrices, one per action, e.g.:
import numpy as np

P[:, :, 2] = np.array([
    [0.9, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0],
    [0.8, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0.8, 0.1, 0, 0, 0, 0.1, 0, 0, 0, 0, 0],
    [0, 0, 0.8, 0.1, 0, 0, 0, 0.1, 0, 0, 0, 0],
    [0.1, 0, 0, 0, 0.8, 0, 0, 0, 0.1, 0, 0, 0],
    [0, 0.1, 0, 0, 0.8, 0, 0, 0, 0, 0.1, 0, 0],
    [0, 0, 0.1, 0, 0, 0, 0.8, 0, 0, 0, 0.1, 0],
    [0, 0, 0, 0.1, 0, 0, 0.8, 0, 0, 0, 0, 0.1],
    [0, 0, 0, 0, 0.1, 0, 0, 0, 0.9, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.2, 0, 0],
    [0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0.8, 0.1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0.8, 0.1],
])
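(A quick sanity check on the storage, assuming the full array P has shape (numStates, numStates, numActions): every row of every P[:, :, a] slice should sum to 1.)

assert np.allclose(P.sum(axis=1), 1.0)  # each row of each action's matrix is a probability distribution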
Below is the code I tried in order to solve the MDP with the Bellman equation. However, there seems to be an error with the dimensions: when I slice through the NumPy arrays, the matrix multiplication is not executed correctly. My struggle is how to store the probabilities and how to make my code more modular.
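For reference, the update I am trying to compute for each state s is the standard value-iteration backup:

U(s) <- R(s) + discount * max_a sum_{s'} P(s' | s, a) * U(s')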
numActions = P.shape[2]
numStates = P.shape[0]

R = -0.04 * np.ones((numStates, 1))   # living reward for every state, as a column vector
R[7] = -1                             # penalty (terminal) state
R[-1] = 1                             # goal (terminal) state
discount = 1

numEpochs = 50                        # number of value-iteration sweeps (50 is just a placeholder)
U_e = np.zeros((numStates, 1))        # utility estimate, kept as a column vector
Q = np.zeros((numStates, numActions))
policy_list = np.zeros((numEpochs, numStates))

for E in range(numEpochs):
    for action in range(numActions):
        # expected utility of the successor state under this action: shape (numStates, 1)
        Q1 = np.dot(P[:, :, action], U_e)
        # Bellman backup for this action: reward plus discounted expected utility
        Q[:, action] = (R + discount * Q1).flatten()

    U_e, policy = np.max(Q, axis=1), np.argmax(Q, axis=1)
    U_e[7] = -1                       # pin the terminal states to their fixed rewards
    U_e[11] = 1
    U_e = U_e.reshape(-1, 1)          # back to a column vector for the next sweep

    policy_list[E, :] = policy
    print(policy)
    print(policy_list)
    print(U_e)
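For the modularity part, this is roughly the shape I am aiming for (a minimal sketch, not my current code; value_iteration, numEpochs and terminal_utils are just names I made up for illustration). Keeping U one-dimensional and letting np.einsum contract over the successor-state axis would avoid the reshape/flatten juggling:

def value_iteration(P, R, discount=1.0, numEpochs=50, terminal_utils=None):
    # P: (numStates, numStates, numActions) transition tensor
    # R: (numStates,) living rewards; terminal_utils: {state index: fixed utility}
    numStates = P.shape[0]
    U = np.zeros(numStates)
    policy = np.zeros(numStates, dtype=int)
    for _ in range(numEpochs):
        # Q[s, a] = R[s] + discount * sum_{s'} P[s, s', a] * U[s']
        Q = R[:, None] + discount * np.einsum('ija,j->ia', P, U)
        U, policy = Q.max(axis=1), Q.argmax(axis=1)
        if terminal_utils:
            for s, u in terminal_utils.items():  # pin the terminal states
                U[s] = u
    return U, policy

# usage with the arrays above (terminal states 7 and 11 in my grid):
# U, policy = value_iteration(P, R.flatten(), discount=1, numEpochs=50,
#                             terminal_utils={7: -1, 11: 1})

With everything one-dimensional, Q.max(axis=1) and Q.argmax(axis=1) line up without any reshaping.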