I'm new to RL and PyTorch, and for my first project I'm training a DDPG agent to solve the inverted pendulum problem (Gym's Pendulum-v1). After referencing code online and fixing some issues, I finally got the model to run, but it never improves; it seems like it is not training at all. Is there something wrong with my implementation, or is it just that my hyperparameters aren't right? Thank you!
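For context, Pendulum-v1 has a 3-dimensional observation (cos θ, sin θ, θ̇) and a single action (torque) bounded to [-2, 2], which is where the state/action dimensions and the tanh scaling in my actor come from. A quick way I check the spaces (assuming the same classic gym install my code below uses):

import gym

env = gym.make('Pendulum-v1', g=9.81)
print(env.observation_space)  # Box(3,) -> [cos(theta), sin(theta), theta_dot]
print(env.action_space)       # Box(1,) with bounds [-2.0, 2.0] (torque)
env.close()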
This is my code:
import copy
import datetime
import os
import random
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class ReplayBuffer:
    def __init__(self, buffer_limit):
        self.buffer = deque(maxlen=buffer_limit)

    def put(self, transition):
        self.buffer.append(transition)

    def sample(self, n):
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        for transition in mini_batch:
            s, a, r, s_prime, done = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask = 0.0 if done else 1.0
            done_mask_lst.append([done_mask])
        s_batch = torch.tensor(s_lst, dtype=torch.float)
        a_batch = torch.tensor(a_lst, dtype=torch.float)
        r_batch = torch.tensor(r_lst, dtype=torch.float)
        s_prime_batch = torch.tensor(s_prime_lst, dtype=torch.float)
        done_batch = torch.tensor(done_mask_lst, dtype=torch.float)
        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def size(self):
        return len(self.buffer)

class Actor(nn.Module):
    """
    Actor (Policy) Model for the DDPG algorithm.
    """

    def __init__(self, state_size, action_size):
        """
        Initialize actor model.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
        """
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_size, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, action_size)
        self.reset_parameters()

    def reset_parameters(self):
        """
        Reset model weights with appropriate initialization.
        """
        # self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        # self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
        self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state):
        """
        Build an actor (policy) network that maps states to actions.

        Args:
            state (torch.Tensor): Input state.

        Returns:
            torch.Tensor: Output actions.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        outputs = torch.tanh(self.fc3(x))
        outputs = torch.mul(2, outputs)  # Scale to the action space, which is [-2.0, 2.0]
        return outputs

class Critic(nn.Module):
    """
    Critic (Value) Model for the DDPG algorithm.
    """

    def __init__(self, state_size, action_size):
        """
        Initialize critic model.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
        """
        super(Critic, self).__init__()
        # State input pathway
        self.fcs1 = nn.Linear(state_size, 16)
        self.fcs2 = nn.Linear(16, 32)
        # Action input pathway
        self.fca1 = nn.Linear(action_size, 32)
        # Combined (state, action) pathway
        self.fc1 = nn.Linear(64, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 1)
        # self.reset_parameters()

    # def reset_parameters(self):
    #     """
    #     Reset model weights with appropriate initialization.
    #     """
    #     self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
    #     self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
    #     self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
    #     self.fc3.weight.data.uniform_(-3e-3, 3e-3)

    def forward(self, state, action):
        """
        Build a critic (value) network that maps (state, action) pairs to Q-values.

        Args:
            state (torch.Tensor): Input state.
            action (torch.Tensor): Input action.

        Returns:
            torch.Tensor: Output Q-values.
        """
        state_out = F.relu(self.fcs1(state))
        state_out = F.relu(self.fcs2(state_out))
        action_out = F.relu(self.fca1(action))
        x = torch.cat((state_out, action_out), dim=-1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array(
            [random.random() for i in range(len(x))]
        )
        self.state = x + dx
        return self.state

class DDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self):
        """Initialize an Agent object."""
        self.state_dim = 3
        self.action_dim = 1
        self.actor_lr = 0.001
        self.critic_lr = 0.001
        self.batch_size = 64
        self.buffer_limit = 50000
        self.tau = 0.005
        self.gamma = 0.99
        # Actor network (w/ target network)
        self.actor_local = Actor(self.state_dim, self.action_dim)
        self.actor_target = Actor(self.state_dim, self.action_dim)
        self.actor_optimizer = optim.Adam(
            self.actor_local.parameters(), lr=self.actor_lr
        )
        # Critic network (w/ target network)
        self.critic_local = Critic(self.state_dim, self.action_dim)
        self.critic_target = Critic(self.state_dim, self.action_dim)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.critic_lr,
        )
        # Noise process
        self.noise = OUNoise(self.action_dim)
        # Replay memory
        self.memory = ReplayBuffer(self.buffer_limit)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        self.memory.put((state, action, reward, next_state, done))
        # Learn, if enough samples are available in memory
        if self.memory.size() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Return actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float()
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -2, 2)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()
    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        actions = actions.squeeze(dim=1)
        # Update critic
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        Params:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(
            target_model.parameters(), local_model.parameters()
        ):
            target_param.data.copy_(
                target_param.data * (1.0 - tau) + local_param.data * tau
            )

def train_DDPGAgent():
    # Initialize the DDPG agent and related variables
    agent = DDPGAgent()
    env = gym.make('Pendulum-v1', g=9.81)
    episodes = 800
    total_rewards = []
    no_of_steps = []
    success_count = 0
    frames = []
    best_episode = 0
    best_reward = float('-inf')
    for episode in range(episodes):
        state = env.reset()
        score, done = 0.0, False
        start_time = datetime.datetime.now()
        counter = 0
        while not done:
            counter += 1
            action = agent.act(state, add_noise=True)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            score += reward
            state = next_state
            # if counter % 50 == 0 and score > -50:
            #     screen = env.render(mode='rgb_array')
            #     frames.append(screen)
        # Recording results
        if len(total_rewards) > 0:
            success_count += (score - total_rewards[-1]) >= 200
        total_rewards.append(score)
        no_of_steps.append(counter)
        if score > best_reward:
            best_reward = score
            best_episode = episode
        # Saving the models
        save_folder = "DDPG"
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        if episode == best_episode:
            model_actor = os.path.join(save_folder, "DDPG_actor" + str(episode) + ".pt")
            model_critic = os.path.join(save_folder, "DDPG_critic" + str(episode) + ".pt")
            torch.save(agent.actor_local.state_dict(), model_actor)
            torch.save(agent.critic_local.state_dict(), model_critic)
        if episode % 10 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print('Episode {:>4} | Total Reward: {:>8.2f} | Elapsed: {}'.format(episode, score, elapsed_time))
    env.close()
    return {
        'total_rewards': total_rewards,
        'no_of_steps': no_of_steps,
        'success_count': success_count,
        'frames': frames
    }

DDPG_results = train_DDPGAgent()
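To show what I mean by "never improves", this is roughly how I plot the returned reward curve afterwards (just a matplotlib sketch of my own; the 20-episode smoothing window is arbitrary):

import matplotlib.pyplot as plt
import numpy as np

rewards = np.array(DDPG_results['total_rewards'])
window = 20
smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')

plt.plot(rewards, alpha=0.3, label='episode reward')
plt.plot(np.arange(window - 1, len(rewards)), smoothed, label='20-episode average')
plt.xlabel('Episode')
plt.ylabel('Total reward')
plt.legend()
plt.show()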
I've tried tuning the hyperparameters (critic learning rate, actor learning rate, batch size, buffer size), but it doesn't seem to help. The TensorFlow code I referenced gets very good results, which is why I was confused that I couldn't get anything close to them. I've been trying for a long time to get this to work, but it just isn't working. Any help is appreciated!
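One more detail in case versions matter: my loop assumes the classic gym API, where env.reset() returns just the observation and env.step() returns four values; newer gym (>= 0.26) and gymnasium return (obs, info) from reset() and five values from step(). A quick way to check which API an install uses (this snippet is only for the check, not part of my training code):

import gym

env = gym.make('Pendulum-v1')
reset_out = env.reset()
step_out = env.step(env.action_space.sample())
print(type(reset_out))  # ndarray on classic gym, (obs, info) tuple on gym>=0.26 / gymnasium
print(len(step_out))    # 4 on classic gym, 5 (terminated/truncated) on newer versions
env.close()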
References:
- TensorFlow/Keras DDPG example: https://keras.io/examples/rl/ddpg_pendulum/
- PyTorch article on DDPG for the mountain car problem: https://archive.is/2nuI8
- PyTorch DDPG for the Gym pendulum problem: https://github.com/udacity/deep-reinforcement-learning/blob/master/ddpg-pendulum/DDPG.ipynb