I have a bug somewhere in my code that apparently prevents my actors from learning the game properly. The code is an implementation of tabular Q-learning, where the intention is to train two actors simultaneously by playing them against each other. The expected result is two Q-tables, one per player, that enable each actor to play optimally (i.e. all games end in a draw). I am using an epsilon-greedy policy, where epsilon is the exploration probability; it decays exponentially after 10k epochs. Furthermore, I use this equation to update the Q-values:

Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
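To make the intended behaviour concrete, here is a minimal standalone sketch of the epsilon-greedy selection and the update rule I am describing (illustrative only; the names and numbers are placeholders, my actual implementation is in the full listing further down):

import numpy as np

rng = np.random.default_rng()

def choose_epsilon_greedy(q_values, available_positions, epsilon):
    # With probability epsilon pick a random legal move (exploration),
    # otherwise pick the legal move with the highest Q-value (exploitation).
    if rng.uniform() < epsilon:
        return available_positions[rng.integers(len(available_positions))]
    return max(available_positions, key=lambda pos: q_values[pos])

def q_update(old_q, reward, best_next_q, alpha=0.1, gamma=1.0):
    # One tabular Q-learning update for the state-action pair just played.
    return old_q + alpha * (reward + gamma * best_next_q - old_q)

# Exploration probability, decayed exponentially once training passes 10k epochs.
epsilon = 1.0
for epoch in range(30000):
    if epoch > 10000 and epoch % 100 == 0:
        epsilon *= 0.9
    # ... one self-play game per epoch, choosing moves with choose_epsilon_greedy
    # and updating the tables with q_update ...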
I have tried many parameter settings, so I do not think it comes down to anything like that. I suspect the problem lies either in the training loop itself or in how I perform the updates of the Q-values. If anyone has the time to take a look and point out any obvious blunders, I would be very grateful. :) The code is included below.
import numpy as np

ROWS = 3
COLS = 3


class TicTacToe:
    def __init__(self, player1, player2):
        self.player_symbol = 1
        self.player1 = player1
        self.player2 = player2
        self.current_player = player1
        self.other_player = player2
        self.board = np.zeros([ROWS, COLS])
        self.game_over = False
        self.winner = None

    def get_available_positions(self):
        available_positions = []
        for row in range(ROWS):
            for col in range(COLS):
                if self.board[row, col] == 0:
                    available_positions.append((row, col))
        return available_positions

    def get_hash(self):
        board_hash = str(self.board.reshape(ROWS * COLS))
        return board_hash

    def update_board(self, position):
        self.board[position] = self.player_symbol

    def check_winner(self):
        for row in range(ROWS):
            rowsum = np.sum(self.board[row, :])
            if rowsum == 3:
                self.game_over = True
                return 1
            if rowsum == -3:
                self.game_over = True
                return -1
        for col in range(COLS):
            colsum = np.sum(self.board[:, col])
            if colsum == 3:
                self.game_over = True
                return 1
            if colsum == -3:
                self.game_over = True
                return -1
        diag_sum1 = 0
        diag_sum2 = 0
        for index in range(ROWS):
            diag_sum1 += self.board[index, index]
            diag_sum2 += self.board[index, COLS - 1 - index]
        if diag_sum1 == 3 or diag_sum2 == 3:
            self.game_over = True
            return 1
        if diag_sum1 == -3 or diag_sum2 == -3:
            self.game_over = True
            return -1
        if len(self.get_available_positions()) == 0:
            self.game_over = True
            return 0
        self.game_over = False
        return None

    def give_reward(self):
        if self.winner == 1:
            if self.player_symbol == 1:
                self.current_player.assign_reward(1)
                self.other_player.assign_reward(-1)
            else:
                self.current_player.assign_reward(-1)
                self.other_player.assign_reward(1)
        elif self.winner == -1:
            if self.player_symbol == -1:
                self.current_player.assign_reward(1)
                self.other_player.assign_reward(-1)
            else:
                self.current_player.assign_reward(-1)
                self.other_player.assign_reward(1)
        elif self.winner == 0:
            if self.player_symbol == 1:
                self.current_player.assign_reward(0)
                self.other_player.assign_reward(0)
            else:
                self.current_player.assign_reward(0)
                self.other_player.assign_reward(0)

    def reset_board(self):
        self.board = np.zeros([ROWS, COLS])
        self.game_over = False
        self.player_symbol = 1
        self.winner = None
        self.current_player = 1
        self.player1.reward = 0
        self.player2.reward = 0

    def switch_players(self):
        self.player_symbol *= -1
        if self.current_player == self.player1:
            self.current_player = self.player2
            self.other_player = self.player1
        else:
            self.current_player = self.player1
            self.other_player = self.player2

    def keep_score(self, epoch):
        if self.winner == 1 or self.winner == -1:
            self.current_player.won_rounds.append(epoch)
            self.current_player.wins += 1
            self.other_player.losses += 1
        if self.winner == 0:
            self.current_player.draw_rounds.append(epoch)
            self.current_player.draws += 1
            self.other_player.draw_rounds.append(epoch)
            self.other_player.draws += 1

    def train(self, epochs=1000):
        for epoch in range(epochs):
            self.current_player = self.player1
            self.other_player = self.player2
            self.player_symbol = 1
            if epoch % 1000 == 0:
                print(f'Epoch {epoch}.')
            if epoch > 10000:
                if epoch % 100 == 0:
                    self.current_player.epsilon = self.current_player.epsilon * 0.9
                    self.other_player.epsilon = self.other_player.epsilon * 0.9
            counter = 0
            while not self.game_over:
                self.current_player.add_state(self.board)
                self.current_player.add_qtable(self.board)
                available_positions = self.get_available_positions()
                player_action = self.current_player.choose_move(available_positions, self.board)
                self.update_board(player_action)
                self.winner = self.check_winner()
                if counter not in (0, 1):
                    self.current_player.update_qtable()
                self.current_player.last_move = player_action
                if self.winner is not None:
                    self.give_reward()
                    self.keep_score(epoch)
                    self.current_player.update_endgame_qtable()
                    self.other_player.update_endgame_qtable()
                self.switch_players()
                counter += 1
            self.current_player.reset_game_state()
            self.other_player.reset_game_state()
            self.reset_board()

class Player:
    def __init__(self, player_id, epsilon=0.9):
        self.game_states = []
        self.qtable = {}
        self.id = player_id
        self.alpha = 0.1
        self.gamma = 1.0
        self.epsilon = epsilon
        self.wins = 0
        self.losses = 0
        self.reward = 0
        self.draws = 0
        self.action = None
        self.last_move = None
        self.won_rounds = []
        self.lost_rounds = []
        self.draw_rounds = []

    @staticmethod
    def get_hash(board):
        board_hash = str(board.reshape(ROWS * COLS))
        return board_hash

    def choose_move(self, available_positions, current_board):
        rand = np.random.uniform()
        if rand <= self.epsilon:
            chosen_index = np.random.choice(len(available_positions))
            chosen_move = available_positions[chosen_index]
        else:
            largest_value = -np.inf
            new_board = np.copy(current_board)
            new_board_hash = self.get_hash(new_board)
            self.add_qtable(current_board)
            value_table = self.qtable.get(new_board_hash)
            position_list = []
            for position in available_positions:
                value = value_table[position]
                if value > largest_value:
                    largest_value = value
                    position_list = [position]
                elif value == largest_value:
                    position_list.append(position)
            chosen_index = np.random.choice(len(position_list))
            chosen_move = position_list[chosen_index]
        return chosen_move

    def assign_reward(self, reward):
        self.reward = reward

    @staticmethod
    def create_new_qtable(new_board):
        new_qtable = np.zeros([ROWS, COLS])
        for row in range(ROWS):
            for col in range(COLS):
                if np.abs(new_board[row, col]) == 1:
                    new_qtable[row, col] = np.nan
        return new_qtable

    def update_qtable(self):
        new_hash = self.get_hash(self.game_states[-1])
        if self.qtable.get(new_hash) is None:
            self.add_qtable(self.game_states[-1])
        new_state = self.get_hash(self.game_states[-1])
        old_state = self.get_hash(self.game_states[-2])
        future_best_q = np.nanmax(self.qtable[new_state])
        past_best_q = self.qtable[old_state][self.last_move]
        updated_qtable = np.copy(self.qtable[old_state])
        new_value = past_best_q + self.alpha * (self.gamma * future_best_q - past_best_q)
        updated_qtable[self.last_move] = new_value
        self.qtable[old_state] = updated_qtable

    def update_endgame_qtable(self):
        winner_key_hash = self.get_hash(self.game_states[-1])
        self.qtable[winner_key_hash][self.last_move] = self.reward

    def add_qtable(self, board):
        new_hash = self.get_hash(board)
        if self.qtable.get(new_hash) is None:
            self.qtable[new_hash] = self.create_new_qtable(board)

    def add_state(self, game_state):
        gs = np.copy(game_state)
        self.game_states.append(gs)

    def reset_game_state(self):
        self.reward = 0
        self.game_states = []

if __name__ == '__main__':
    n_avg = 1
    n_epochs = 30000
    avg_results = np.zeros([n_avg, n_epochs])
    for i in range(n_avg):
        player1 = Player(1, epsilon=1.0)
        player2 = Player(-1, epsilon=1.0)
        game = TicTacToe(player1, player2)
        game.train(epochs=n_epochs)
        # print(player1.qtable)
        print(player2.qtable)
        avg_results[i, player1.won_rounds] = 1
        avg_results[i, player2.won_rounds] = -1
    print(f"Player 1 win ratio: {player1.wins / n_epochs}")
    print(f"Player 2 win ratio: {player2.wins / n_epochs}")
    print(f"Draw ratio: {player1.draws / n_epochs}")
EDIT: Oh, it DOES work! Apparently I had forgotten to update epsilon for one player! Also, my wife found some bugs (credit where credit is due...)
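In case it helps anyone else: the fix was simply making sure the exponential decay is applied to both players' epsilon in the training loop, along these lines (standalone sketch; in the real code the epsilons live on the Player objects above):

# Decay the exploration rate for *both* self-play agents, not just one.
epsilons = {"player1": 1.0, "player2": 1.0}
for epoch in range(30000):
    if epoch > 10000 and epoch % 100 == 0:
        for name in epsilons:
            epsilons[name] *= 0.9
print(epsilons)  # both players end up with the same, small epsilon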