I want to implement a Markov Decision Process (MDP) for the robot example. I use NumPy arrays to store one transition-probability matrix per action, e.g.:
# Transition matrix for action index 2 (third axis of P selects the action).
# Entry [s, s'] is the probability of ending in state s' when the action is
# taken in state s; there are 12 states and every row sums to 1.
# NOTE(review): P must already be allocated (12 x 12 x numActions) before this
# slice assignment — its creation is not shown here.
P[:,:,2] = np.array([ [0.9, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0],
    [0.8, 0.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0.8, 0.1, 0, 0, 0, 0.1, 0, 0, 0, 0, 0],
    [0, 0, 0.8, 0.1, 0, 0, 0, 0.1, 0, 0, 0, 0],
    [0.1, 0, 0, 0, 0.8, 0, 0, 0, 0.1, 0, 0, 0],
    [0, 0.1, 0, 0, 0.8, 0, 0, 0, 0, 0.1, 0, 0],
    [0, 0, 0.1, 0, 0, 0, 0.8, 0, 0, 0, 0.1, 0],
    [0, 0, 0, 0.1, 0, 0, 0.8, 0, 0, 0, 0, 0.1],
    [0, 0, 0, 0, 0.1, 0, 0, 0, 0.9, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0.8, 0.2, 0, 0],
    [0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0.8, 0.1, 0],
    [0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0.8, 0.1]
])
# This is the code I wrote to solve the MDP using the Bellman equation.
# However, there seems to be an error with the dimensions: when I slice the
# NumPy arrays, the matrix multiplication does not produce the shapes I expect.
# I am struggling with how to store the probabilities and how to make my code
# more modular.
def _bellman_backup(P, R, U, discount):
    """Return the action-value table for one Bellman backup.

    Computes ``Q[s, a] = R[s] + discount * sum_s' P[s, s', a] * U[s']``.

    Args:
        P: array of shape (numStates, numStates, numActions); ``P[s, s', a]``
            is the probability of reaching ``s'`` from ``s`` under action ``a``.
        R: per-state rewards, shape (numStates,) or (numStates, 1).
        U: current utilities, shape (numStates,) or (numStates, 1).
        discount: discount factor applied to the expected next-state utility.

    Returns:
        Array of shape (numStates, numActions).
    """
    # Work with flat vectors so column-vs-1-D inputs cannot trigger
    # broadcasting surprises when the result is stored per action.
    rewards = np.ravel(R)
    utilities = np.ravel(U)
    # Contract the next-state axis of P with U -> (numStates, numActions).
    expected_next = np.tensordot(P, utilities, axes=([1], [0]))
    return rewards[:, np.newaxis] + discount * expected_next


numActions = P.shape[2]
numStates = P.shape[0]

# Per-state reward (column vector): small step cost everywhere, except the
# two terminal states.
R = -0.04 * np.ones((numStates, 1))
R[7] = -1
R[-1] = 1
discount = 1

# Terminal states keep a fixed utility after every sweep; defined once so the
# indices are not repeated throughout the loop.
terminal_utilities = {7: -1, numStates - 1: 1}

# Row E holds the greedy policy found in epoch E.
policy_list = np.zeros((numEpochs, numStates))

for E in range(numEpochs):
    # Previously the reward-augmented values were computed but the
    # un-rewarded ones were stored, and the discount was never applied.
    Q = _bellman_backup(P, R, U_e, discount)

    policy = np.argmax(Q, axis=1)
    U_e = np.max(Q, axis=1)
    for state, value in terminal_utilities.items():
        U_e[state] = value
    # Keep U_e as a column vector, as the rest of the script expects.
    U_e = U_e.reshape(-1, 1)

    print(policy)
    policy_list[E, :] = policy
    print(policy_list)
    print(U_e)
# Runs once after the epoch loop: re-pin the utility of state 11, restore the
# column-vector shape, and report the final policy and utilities.
# (The brackets here were markdown-escaped, which is not valid Python.)
U_e[11] = 1
U_e = U_e.reshape(-1, 1)
print(policy)
policy_fl = policy.flatten()  # flat copy of the final policy
policy_list[E, :] = policy[:]  # E still holds the last epoch index
print(policy_list)
print(U_e)