Flappy Bird linear Q-learning approximation doesn't learn

Before asking for help, I apologize for my English; I'm from Switzerland, so it is not my first language. I am building a reinforcement learning bot that learns to play Flappy Bird. I am using linear Q-learning function approximation, but the agent doesn't seem to be learning: it consistently receives the same reward, and I can't tell whether my code is broken or whether I simply need to train for much longer (for example, 8 hours). Please don't judge too harshly; I am just starting to learn reinforcement learning.

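To clarify what I mean by linear approximation: the observation is a 12-element vector and there are two actions (0 = do nothing, 1 = flap), so I represent the Q-function as a dot product between the observation (used directly as the feature vector) and a 12×2 weight matrix. This is only a sketch of my intent, with illustrative names, not my training code:

```python
import numpy as np

# Sketch of the intended Q-function parameterization (illustrative names only).
W = np.random.rand(12, 2)  # one weight column per action

def q_values(state, W):
    features = np.asarray(state).reshape(12)  # raw observation used as the feature vector
    return features @ W                       # length-2 vector: Q(s, 0) and Q(s, 1)

def greedy_action(state, W):
    return int(np.argmax(q_values(state, W)))  # pick the action with the larger Q-value
```

Here is my full training code:
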
```python
import flappy_bird_gymnasium  # third-party package that registers the FlappyBird-v0 environment
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import uniform

env = gym.make("FlappyBird-v0", render_mode="rgb_array")

lr = 0.01          # learning rate
decay = 0.0000001  # epsilon decay rate
max_steps = 1000
gamma = 0.95       # discount factor
min_eps = 0.001
max_eps = 1

neps = 100000                  # number of training episodes
W = np.random.rand(12, 2)      # one weight column per action (12 features, 2 actions)
print(env.observation_space)

def plot_win_rate(rewards_all_episodes):
    rewards_optimal = np.array(rewards_all_episodes)
    rewards_optimal = np.array([0 if x == -1 else x for x in rewards_optimal])
    rewards_optimal = rewards_optimal.cumsum()
    win_rate_optimal = rewards_optimal / np.arange(1, len(rewards_all_episodes) + 1)

    plt.plot(np.arange(1, len(rewards_all_episodes) + 1), win_rate_optimal)
    plt.xlabel('Episode')
    plt.ylabel('Win Rate')
    plt.title('Win Rate per Episode')
    plt.show()


def f(state):
    # Feature vector: the raw 12-dimensional observation as a (12, 1) column vector.
    return np.array(state).reshape(12, -1)


def epsilon_greedy(W, epsilon, state):
    features = f(state)
    if np.random.uniform(0, 1) > epsilon:
        # Exploit: pick the action with the highest estimated Q-value.
        action = np.argmax(np.dot(W.T, features))
        explored = 0
    else:
        # Explore: random action, flapping (action 1) only 7% of the time.
        action = 0 if uniform(0, 1) > 0.07 else 1
        explored = 1

    return action, explored

def gradient(X, W, Y):
    m = X.shape[0]
    return (1 / m) * (X.T @ (X @ W - Y))

def regression(X, W, Y):
    # Plain least-squares gradient step (not used in the training loop below).
    W -= lr * gradient(X, W, Y)
    return W
    
def update(W, state, reward, newstate):
    # Q-values of both actions for the current and next state (each is a length-2 vector).
    q_values = np.dot(f(state).T, W)
    next_q_values = np.dot(f(newstate).T, W)

    # TD-style update applied to all of W via an outer product of the features
    # with the vector of (reward + gamma * max Q(s') - Q(s, a)) terms.
    W += lr * np.outer(f(state), (reward + gamma * np.max(next_q_values) - q_values))

    return W


def train(neps, min_eps, max_eps, decay, env, max_steps, W, fr):
    rewards_all_episodes = []

    for episode in tqdm(range(1, neps + 1)):
        epsilon = min_eps + (max_eps - min_eps) * np.exp(-decay * episode)
        state, _ = env.reset()
        rewards = 0
        X = []
        Y = []
        while True:
            action, explored = epsilon_greedy(W, epsilon, state)

            # Gymnasium's step() returns (obs, reward, terminated, truncated, info);
            # only `terminated` is used as the episode-end signal here.
            new_state, reward, done, _, _ = env.step(action)
            W = update(W, state, reward, new_state)
            rewards += reward
            X.append(reward)  # collected but not used anywhere
            Y.append(state)   # collected but not used anywhere
            if done:
                rewards_all_episodes.append(rewards)
                break

            state = new_state
        
        if episode % fr == 0:
            q_values = np.dot(f(state).T, W)
            print(W)
            print("epsilon ", epsilon)
            print("\nQ-values:", q_values)
            print("Episode {}: reward: {}".format(episode, rewards))

    return W, rewards_all_episodes


W, rewards_all_episodes = train(neps, min_eps, max_eps, decay, env, max_steps, W, 2500)
plot_win_rate(rewards_all_episodes)
np.save("trained_weightsQL.npy", W)


print(rewards_all_episodes)
while True:
    pass
```
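
For reference, this is what I believe a single linear Q-learning update step is supposed to look like: only the weight column of the action actually taken is updated, using a scalar TD error. I am not sure whether my `update` function above matches this, which may be part of my problem. A minimal sketch of my understanding (names are only for illustration):

```python
import numpy as np

def td_update(W, state, action, reward, next_state, done, lr=0.01, gamma=0.95):
    """One linear Q-learning step as I understand it (illustrative sketch)."""
    features = np.asarray(state).reshape(12)
    next_features = np.asarray(next_state).reshape(12)

    q_sa = features @ W[:, action]  # Q(s, a) for the action that was taken
    target = reward if done else reward + gamma * np.max(next_features @ W)
    td_error = target - q_sa        # scalar temporal-difference error

    W[:, action] += lr * td_error * features  # update only that action's weights
    return W
```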

I even tried a DQN algorithm instead, but nothing works either.
