I am working on a grid optimization model where I import data from a dataset and train an RL agent in a custom Reinforcement Learning environment. I also want to extract the grid and PV values step by step. While training, the custom environment does fill these lists, but when I run prediction afterwards they come back empty. What is the problem?
This is how I import the dataset:
import pandas as pd

def get_data(start='2017-01-01 00:00:00', end='2017-01-01 23:55:00'):
    # Import the standard load profile and resample it to 15-minute resolution
    slp = pd.read_csv('df_p.csv', index_col=0, parse_dates=True)['0'] / 1000
    slp = slp.resample('15min').mean() * 3
    # Import the PV generation and align it to the load index
    pv = pd.read_csv('Solar_Data-2011.csv', delimiter=';',
                     index_col=0, parse_dates=False)["Generation"] * 3
    pv.index = slp.index
    print("Load values:")
    print(slp.values)
    print("PV values:")
    print(pv.values)
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)
    return slp[start:end], pv[start:end]
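For reference, this is roughly how I call it (a minimal sketch; it assumes both CSV files are in the working directory):

# Quick check of the returned series
load, pv = get_data()
print(len(load), len(pv))  # one day at 15-minute resolution, so 96 steps each if the CSVs cover the full day
print(load.head())
print(pv.head())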
This is my custom environment:
from gym import Env
from gym.spaces import Box, Dict, Discrete


class CostEnv(Env):
    def __init__(self):
        # Two discrete actions: 0 = take all electricity from the grid, 1 = take all electricity from PV
        self.action_space = Discrete(2)
        ### Get input data, just choose one day for now
        self.load, self.pv = get_data(start='2017-01-01 00:00:00', end='2017-01-01 23:55:00')
        self.pv_price = 0.10
        self.grid_price = 0.40
        self.line_max = 15
        self.grid_penalty = 100
        self.battery_max = 18
        self.battery_state = 10
        self.pv_values = []
        self.grid_values = []
        # Set episode length
        self.episode_length = len(self.load)
        self.observation_space = Dict(
            {
                "load": Box(0, self.episode_length - 1, shape=(2,), dtype=int),
                "pv": Box(0, self.episode_length - 1, shape=(2,), dtype=int),
            }
        )
    def step(self, action):
        # Index of the current timestep in the load/pv series
        t = len(self.load) - self.episode_length
        ### We calculate the reward based on the price of the electricity:
        ### lower price -> "higher" reward
        if action == 0:
            # Take all electricity from the grid
            if self.load[t] > self.line_max:
                reward = self.load[t] * self.grid_price * -1 - abs(self.load[t] - self.line_max) * self.grid_penalty
            else:
                reward = self.load[t] * self.grid_price * -1
            self.grid_values.append(self.load[t])
        elif action == 1:
            # Take all electricity from PV
            if self.pv[t] >= self.load[t]:
                # PV covers the load; penalize the surplus if it exceeds the line limit
                if abs(self.pv[t] - self.load[t]) > self.line_max:
                    reward = self.load[t] * self.pv_price * -1 - (self.pv[t] - self.load[t]) * self.grid_penalty
                else:
                    reward = self.load[t] * self.pv_price * -1
            else:
                # PV does not cover the load; buy the remainder from the grid
                if (self.load[t] - self.pv[t]) > self.line_max:
                    reward = self.pv[t] * self.pv_price * -1 - (self.load[t] - self.pv[t]) * self.grid_price - (self.load[t] - self.pv[t]) * self.grid_penalty
                else:
                    reward = self.pv[t] * self.pv_price * -1
            self.pv_values.append(self.pv[t])
            ### This may lead the agent to always choose action 1,
            ### because it will always supply the demand and will always be cheaper.
        else:
            # Invalid action
            reward = -300000
            # raise ValueError(f'Invalid action: {action}')

        info = {}
        ### Observation
        observation = {
            "load": (0, self.load[t]),
            "pv": (0, self.pv[t]),
        }
        ### Decrement either here or before checking self.episode_length
        self.episode_length -= 1
        ### Check if the timeseries is over
        done = self.episode_length <= 0

        # Return step information
        return observation, reward, done, info
    def render(self):
        # Implement viz
        pass

    def reset(self):
        self.done = False
        # Set episode length
        self.episode_length = len(self.load)
        observation = {
            "load": (0, self.load[len(self.load) - self.episode_length]),
            "pv": (0, self.pv[len(self.pv) - self.episode_length]),
        }
        return observation
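For reference, stepping the environment by hand does fill the lists (a minimal sketch with a sampled action, just to show what I mean by getting values during training):

env = CostEnv()
obs = env.reset()
# One step with a sampled action (0 or 1) appends to grid_values or pv_values
obs, reward, done, info = env.step(env.action_space.sample())
print(env.grid_values, env.pv_values)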
Here is my model training:
import os
from stable_baselines3 import A2C

log_path = os.path.join('Training', 'Logs')
model = A2C("MultiInputPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=300000)
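After training I save the model with something like this (the filename matches the one I load later):

model.save("A2C_Multi_Input_Policy_Improved_1")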
The lists collected during training look like this (output truncated):
env.grid_values: [0.4014, 0.342, 0.5357999999999999, 0.4698, 0.44999999999999996, 0.376, 0.521, 0.4293999999999999, 0.25140000000000007, 0.7412000000000001, ...]
env.pv_values: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0607460715, 0.0678108435, 0.07642341180000001, ...]
For prediction I use a different load profile from the same files; the only change in get_data is the column ('2' instead of '0'):
def get_data(start='2017-01-01 00:00:00', end='2017-01-01 23:55:00'):
    # import standard load profiles
    slp = pd.read_csv('df_p.csv', index_col=0, parse_dates=True)['2'] / 1000
    slp = slp.resample('15min').mean() * 3
    pv = pd.read_csv('Solar_Data-2011.csv', delimiter=';',
                     index_col=0, parse_dates=False)["Generation"] * 3
    pv.index = slp.index
    print("Load values:")
    print(slp.values)
    print("PV values:")
    print(pv.values)
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)
    return slp[start:end], pv[start:end]
The prediction code is:
episodes = 20
for ep in range(episodes):
    obs = env.reset()
    done = False
    while not done:
        action = model.predict(obs)
        obs, rewards, done, info = env.step(action)
env.close()
Afterwards, both lists are empty:
env.pv_values
[]
env.grid_values
[]
Please tell me what I am doing wrong. Note that I load the model in a separate Jupyter notebook; the environment is the same one I use for training. The loading code is as follows:
model = A2C("MultiInputPolicy", env=env)
model = model.load("A2C_Multi_Input_Policy_Improved_1", env=env)