I am in an AI class and am trying to design an MDP that starts at random start points and finds a path to the goal state while avoiding blocked states and staying within the boundaries of the grid. My agent seems to pick different actions at first, but then it gets stuck in a loop, only choosing to stay where it is.
The following is the code I currently have written in Python on Google Colab ...
import numpy as np
import matplotlib.pyplot as plt
class GridMDP:
def __init__(self, rows, cols):
"""
Initializes the GridMDP with the specified number of rows and columns.
"""
self.rows = rows
self.cols = cols
self.grid = [['empty' for _ in range(cols)] for _ in range(rows)]
self.grid[0][0] = 'goal'
self.discount_factor = 0.9
def set_blocked_cells(self, blocked_cells):
"""
Sets the specified cells as blocked in the grid.
"""
for cell in blocked_cells:
row, col = cell
self.grid[row][col] = 'blocked'
def set_goal_cell(self, cell):
row, col = cell
self.grid[row][col] = 'goal'
def get_states(self):
states = [(i, j) for i in range(self.rows) for j in range(self.cols)]
return states
def get_actions(self):
return ['up', 'down', 'left', 'right', 'stay']
def get_transition_prob(self, state, action):
"""
Calculates the transition probabilities to next states based on the current state and action.
"""
next_state_probs = {}
# Move actions succeed with probability 0.7 when the target cell is in bounds and not blocked; 'stay' returns probability 0.3 for the current cell
if action == 'up' and state[0] > 0 and self.grid[state[0] - 1][state[1]] != 'blocked':
next_state_probs[(state[0] - 1, state[1])] = 0.7
elif action == 'down' and state[0] < self.rows - 1 and self.grid[state[0] + 1][state[1]] != 'blocked':
next_state_probs[(state[0] + 1, state[1])] = 0.7
elif action == 'left' and state[1] > 0 and self.grid[state[0]][state[1] - 1] != 'blocked':
next_state_probs[(state[0], state[1] - 1)] = 0.7
elif action == 'right' and state[1] < self.cols - 1 and self.grid[state[0]][state[1] + 1] != 'blocked':
next_state_probs[(state[0], state[1] + 1)] = 0.7
elif action == 'stay':
next_state_probs[state] = 0.3
return next_state_probs
def get_reward(self, state, action, next_state):
"""
Calculates the reward based on the current state, action, and next state.
"""
return -1
def visualize_results(mdp, policy, values):
"""
Visualizes the grid world and the computed policy using matplotlib.
"""
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
# Plot grid world with goal and blocked cells
grid_data = np.zeros((mdp.rows, mdp.cols))
for i in range(mdp.rows):
for j in range(mdp.cols):
if mdp.grid[i][j] == 'goal':
grid_data[i, j] = 2
elif mdp.grid[i][j] == 'blocked':
grid_data[i, j] = -1
else:
grid_data[i, j] = 0
axes[0].imshow(grid_data, cmap='viridis')
axes[0].set_title('Grid World')
axes[0].grid(True, which='both', color='black', linewidth=1.5)
# Plot optimal policy
policy_data = np.zeros((mdp.rows, mdp.cols), dtype=int)
for i in range(mdp.rows):
for j in range(mdp.cols):
action = policy[i][j]
if action == 'up':
policy_data[i, j] = 0
elif action == 'down':
policy_data[i, j] = 1
elif action == 'left':
policy_data[i, j] = 2
elif action == 'right':
policy_data[i, j] = 3
elif action == 'stay':
policy_data[i, j] = 4
axes[1].imshow(policy_data, cmap='viridis', vmin=0, vmax=4)
axes[1].set_title('Optimal Policy')
axes[1].grid(True, which='both', color='black', linewidth=1.5)
plt.show()
def greedy_policy_evaluation(mdp, policy, start_state, num_trials=3):
"""
Evaluates a policy by simulating the agent's actions from a start state until the goal is reached.
Parameters:
- mdp: GridMDP instance
- policy: 2D array representing the policy
- start_state: Tuple representing the starting state (row, col)
- num_trials: Number of trials for evaluation
Returns:
- median_steps: Median number of steps to reach the goal over the trials
"""
steps_to_goal = []
for _ in range(num_trials):
current_state = start_state
steps = 0
while current_state != (0, 0): # Continue until reaching the goal
action = policy[current_state[0]][current_state[1]]
# Debugging prints
print("Current State:", current_state)
print("Action:", action)
next_state_probs = mdp.get_transition_prob(current_state, action)
next_states = list(next_state_probs.keys())
print("Next State Probabilities:", next_state_probs)
print("Next States:", next_states)
next_state_index = np.random.choice(len(next_states))
next_state = next_states[next_state_index]
print("Next State Chosen:", next_state)
steps += 1
current_state = next_state
steps_to_goal.append(steps)
return np.median(steps_to_goal)
# Task 1: Define the MDP
mdp = GridMDP(rows=25, cols=25)
blocked_cells = [(7,2),(7,3),(7,4),(6,4),
(3,9),(2,9),(2,10),
(5,11),(5,12),(6,11),(7,9),(7,10),(7,11),
(6,16),(5,16),(5,17),(5,18),(4,18),(3,17),(3,18),
(8,18),(8,19),(8,20),(8,21),(8,22),(8,23),(9,18),(9,23),(10,18),(10,23),(10,24),(11,18),(12,18),
(11,8),(11,10),(12,8),(12,10),(13,8),(13,9),(13,10),
(17,7),(17,8),(17,9),(17,10),(17,11),
(17,18),(17,19),(17,20),(17,21),(17,22),(18,19),(19,19),(20,19),(21,19),
(19,15),(19,16),(19,17),(20,15),(21,15),(22,12),(22,13),(22,14),(22,15),
(20,3),(21,3),(22,3),(23,3),(23,4),(23,5),(23,6),(24,6)
]
mdp.set_blocked_cells(blocked_cells)
# Task 2: Implement Policy Iteration
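# policy_iteration alternates full policy evaluation (iterative Bellman backups under the
# current policy) with greedy policy improvement (one-step lookahead over all actions),
# using the two helper functions defined below.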
def policy_iteration(mdp, num_iterations=100):
policy = np.random.choice(mdp.get_actions(), size=(mdp.rows, mdp.cols))
for _ in range(num_iterations):
values = policy_evaluation(mdp, policy)
policy = policy_improvement(mdp, values)
return policy, values
def policy_evaluation(mdp, policy, num_iterations=100):
values = np.zeros((mdp.rows, mdp.cols))
for _ in range(num_iterations):
new_values = np.zeros_like(values)
for state in mdp.get_states():
action = policy[state[0]][state[1]]
next_state_probs = mdp.get_transition_prob(state, action)
expected_value = sum(prob * (mdp.get_reward(state, action, next_state) +
mdp.discount_factor * values[next_state[0]][next_state[1]])
for next_state, prob in next_state_probs.items())
new_values[state[0]][state[1]] = expected_value
values = new_values
return values
def policy_improvement(mdp, values):
new_policy = np.zeros((mdp.rows, mdp.cols), dtype=object)
for state in mdp.get_states():
action_values = {}
for action in mdp.get_actions():
next_state_probs = mdp.get_transition_prob(state, action)
expected_value = sum(prob * (mdp.get_reward(state, action, next_state) +
mdp.discount_factor * values[next_state[0]][next_state[1]])
for next_state, prob in next_state_probs.items())
action_values[action] = expected_value
best_action = max(action_values, key=action_values.get)
new_policy[state[0]][state[1]] = best_action
return new_policy
# Task 3: Visualize the Results
optimal_policy, optimal_values = policy_iteration(mdp)
visualize_results(mdp, optimal_policy, optimal_values)
# Random Policy Performance
start_states_random = [(21, 9), (20, 20), (11, 21)] # Example start states
random_policy = np.array([[np.random.choice(mdp.get_actions()) for _ in range(mdp.cols)] for _ in range(mdp.rows)])
random_policy_performance = [greedy_policy_evaluation(mdp, random_policy, start_state) for start_state in start_states_random]
# Intermediate Policy Performance
intermediate_policy, _ = policy_iteration(mdp, num_iterations=50) # Adjust the number of iterations as needed
intermediate_policy_performance = [greedy_policy_evaluation(mdp, intermediate_policy, start_state) for start_state in start_states_random]
# Optimal Policy Performance
optimal_policy_performance = [greedy_policy_evaluation(mdp, optimal_policy, start_state) for start_state in start_states_random]
# Print or log the performance metrics
print("Random Policy Performance:", random_policy_performance)
print("Intermediate Policy Performance:", intermediate_policy_performance)
print("Optimal Policy Performance:", optimal_policy_performance)
... the issue is that it seems to get stuck choosing the next state, because it continuously chooses to stay in the state that it is in.
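In case it helps anyone reproduce or inspect the behavior, here is a small diagnostic snippet, separate from the assignment code above, that prints what get_transition_prob returns for every action from one state and then samples a next state weighted by the stored probabilities via np.random.choice(..., p=...). The probe state is just an arbitrary example (one of my start states), and normalizing the probabilities before sampling is my own assumption about how they are meant to be used.

# Diagnostic sketch: inspect the transition dictionaries the MDP returns from one state
# and sample a next state weighted by the stored probabilities (normalized first).
# Assumes `mdp` is the GridMDP instance defined above; probe_state is an arbitrary example.
probe_state = (21, 9)
for action in mdp.get_actions():
    next_state_probs = mdp.get_transition_prob(probe_state, action)
    total = sum(next_state_probs.values())
    print(f"{action:>5}: {next_state_probs} (total probability {total:.2f})")
    if next_state_probs:
        next_states = list(next_state_probs.keys())
        probs = np.array(list(next_state_probs.values()), dtype=float)
        probs = probs / probs.sum()  # normalize so the weights sum to 1 for np.random.choice
        idx = np.random.choice(len(next_states), p=probs)
        print("      sampled next state:", next_states[idx])
    else:
        print("      no next states returned for this action")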