如何解决Keras Double DQN 平均奖励随时间减少且无法收敛
我正在尝试训练一个 Double DQN 智能体来玩一个网格世界:其中有一个搜索者(即智能体),它要设法收集所有随机生成的躲藏者。每走一步的路径代价(path_cost)为 -0.1,每收集到一个躲藏者则获得 1 的奖励。DQN 网络接收一个形状为 (world_width,world_height,1) 的数组作为状态,它是从上方俯视的环境的完整表示,其中空白格记为 0,搜索者记为 2,躲藏者记为 3。然后智能体需要从向左、向上、向右、向下四个动作中选择一个。下图显示了环境的一个示例配置。
然而,在训练智能体时,奖励一开始就随着探索的减少而下降,因此可以推断:智能体按照 DQN 网络行动时的表现比随机选择动作时更差。以下是我使用不同超参数训练时得到的一些奖励曲线示例(y 轴是总步数,每集最多 100 步,除非提前完成)。
可以看到,智能体解决环境问题的能力越来越差,并且大约在 epsilon 等于我的 min_epsilon 时曲线趋于稳定(这意味着几乎不再有探索或随机移动)。
我尝试了不同的超参数,但结果没有任何明显差异,如果有人能给我指出问题所在,我将不胜感激。
我主要使用的超参数是:
wandb.config.epsilon = 1.0
wandb.config.epsilon_decay = 0.99
wandb.config.batch_size = 32
wandb.config.learning_rate = 1e-3
wandb.config.gamma = 0.8
wandb.config.min_epsilon = 1e-1
wandb.config.buffersize = 10000
wandb.config.epochs = 1
wandb.config.reward_discount = 0.01
wandb.config.episodes = 1000
这是我的代码:
import tensorflow as tf
from tensorflow.keras.layers import Input,Dense,Conv2D,MaxPooling2D,Flatten
from tensorflow.keras.optimizers import Adam
from collections import deque
from termcolor import colored
import wandb
from wandb.keras import WandbCallback
import numpy as np
import copy,os,random
from argparse import ArgumentParser
from plotter import plotter
from HNS import HNS
# Run Keras in float64 so network outputs share NumPy's default float dtype.
tf.keras.backend.set_floatx('float64')

# NOTE(review): `name`, `project` and `node` are not defined anywhere in the
# visible source; they must be supplied before this script can run.
wandb.init(name=name,project=project)

# Environment settings, stored as ad-hoc attributes on `wandb.env`.
wandb.env.name = "HNS"
wandb.env.world_size = (8,8)
wandb.env.state_dim = (8,8,1)
wandb.env.hider_count = 2
wandb.env.action_dim = 4
wandb.env.random_spawn = True
wandb.env.max_steps = 100

# Training hyperparameters.
wandb.config.node = node
wandb.config.epsilon = 1.0
wandb.config.epsilon_decay = 0.99
wandb.config.batch_size = 32
wandb.config.learning_rate = 1e-3
wandb.config.gamma = 0.8
wandb.config.min_epsilon = 1e-1
wandb.config.buffersize = 10000
wandb.config.epochs = 1
wandb.config.reward_discount = 0.01
wandb.config.episodes = 1000

# Network architecture.
# NOTE(review): with "same" padding and stride 4, an 8x8 input shrinks to 2x2
# after conv_1 and to 1x1 after conv_2, so very little spatial detail of the
# grid reaches the dense layers. Smaller kernels/strides are worth trying.
wandb.config.conv1_kernel = (8,8)
wandb.config.conv1_filters = 16
wandb.config.conv1_strides = 4
wandb.config.conv1_activation = "relu"
wandb.config.conv1_padding = "same"
wandb.config.conv2_kernel = (4,4)
wandb.config.conv2_filters = 32
wandb.config.conv2_strides = 4
wandb.config.conv2_activation = "relu"
wandb.config.conv2_padding = "same"
wandb.config.dense1_neurons = 16
wandb.config.dense1_activation = "relu"
wandb.config.loss = "mse"

# Command-line overrides. Every option declares its type: without `type=`,
# argparse hands back strings for values given on the command line, which
# would then break the numeric code that reads them.
parser = ArgumentParser()
parser.add_argument('--hider_count',type=int,default=wandb.env.hider_count)
parser.add_argument('--max_steps',type=int,default=wandb.env.max_steps)
parser.add_argument('--epsilon_decay',type=float,default=wandb.config.epsilon_decay)
parser.add_argument('--min_epsilon',type=float,default=wandb.config.min_epsilon)
parser.add_argument('--learning_rate',type=float,default=wandb.config.learning_rate)
parser.add_argument('--gamma',type=float,default=wandb.config.gamma)
parser.add_argument('--reward_discount',type=float,default=wandb.config.reward_discount)
parser.add_argument('--episodes',type=int,default=wandb.config.episodes)
parser.add_argument('--batch_size',type=int,default=wandb.config.batch_size)
# parse_known_args tolerates extra arguments instead of exiting on them.
args,unknown = parser.parse_known_args()
# Copy the parsed values into wandb.config. Note that `hider_count` and
# `max_steps` land in wandb.config, while the rest of the script reads them
# from wandb.env, so those two flags currently have no effect on training.
wandb.config.update(args,allow_val_change=True)
class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done)
    transitions with uniform random sampling."""

    def __init__(self, maxlen=None):
        """Create an empty buffer.

        Args:
            maxlen: Maximum number of stored transitions; the oldest are
                dropped first. Defaults to ``wandb.config.buffersize``.
        """
        if maxlen is None:
            maxlen = wandb.config.buffersize
        self.buffer = deque(maxlen=maxlen)

    def put(self, state, action, reward, next_state, done):
        """Append one complete transition to the buffer."""
        # All five fields are stored: sample() unpacks exactly these five.
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Draw a uniform random batch without replacement.

        Args:
            batch_size: Number of transitions to draw. Defaults to
                ``wandb.config.batch_size``.

        Returns:
            Tuple ``(states, actions, rewards, next_states, done)`` of NumPy
            arrays whose rows are aligned: row ``i`` of each array comes from
            the same transition.

        Raises:
            ValueError: If the buffer holds fewer than ``batch_size`` items.
        """
        if batch_size is None:
            batch_size = wandb.config.batch_size
        batch = random.sample(self.buffer, int(batch_size))
        # zip(*batch) turns a list of transitions into one column per field.
        states, actions, rewards, next_states, done = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, done

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)
class ActionStatemodel:
    """Convolutional Q-network with an epsilon-greedy action selector.

    Attributes:
        epsilon: Current probability of picking a random action.
        model: Compiled Keras model mapping a batch of states to one Q-value
            per action.
    """

    def __init__(self):
        self.epsilon = wandb.config.epsilon
        self.model = self.create_model()

    def create_model(self):
        """Build and compile the network described by ``wandb.config``.

        Returns:
            A compiled ``tf.keras.Sequential`` with two convolutions, one
            hidden dense layer and a linear output of ``wandb.env.action_dim``
            units.
        """
        model = tf.keras.Sequential()
        model.add(Conv2D(
            filters=wandb.config.conv1_filters,
            kernel_size=wandb.config.conv1_kernel,
            activation=wandb.config.conv1_activation,
            strides=wandb.config.conv1_strides,
            padding=wandb.config.conv1_padding,
            name="conv_1",
            input_shape=wandb.env.state_dim,
        ))
        model.add(Conv2D(
            filters=wandb.config.conv2_filters,
            kernel_size=wandb.config.conv2_kernel,
            activation=wandb.config.conv2_activation,
            strides=wandb.config.conv2_strides,
            padding=wandb.config.conv2_padding,
            name="conv_2",
        ))
        model.add(Flatten())
        model.add(Dense(
            units=wandb.config.dense1_neurons,
            activation=wandb.config.dense1_activation,
            name="dense_1",
        ))
        # No activation on the output layer: Q-values are unbounded.
        model.add(Dense(wandb.env.action_dim, name="dense_2"))
        model.compile(loss=wandb.config.loss, optimizer=Adam(wandb.config.learning_rate))
        model.summary()
        return model

    def predict(self, state):
        """Return the Q-values for a batch of states."""
        return self.model.predict(state)

    def get_action(self, state):
        """Choose an action for a single state with an epsilon-greedy policy.

        Args:
            state: One un-batched state array.

        Returns:
            Tuple ``(action, explored)``: ``action`` is an ``int`` action
            index, ``explored`` is 1 for a random action and 0 for a greedy
            one.
        """
        # Decide first, so exploratory steps skip the network forward pass.
        if np.random.random() < self.epsilon:
            return random.randint(0, wandb.env.action_dim - 1), 1
        # The model expects a batch, so add a leading axis of length 1.
        q_value = self.predict(np.expand_dims(state, axis=0))
        return int(np.argmax(q_value)), 0

    def train(self, states, targets):
        """Fit the model on one batch and return the first epoch's loss."""
        history = self.model.fit(
            states,
            targets,
            epochs=wandb.config.epochs,
            callbacks=[WandbCallback()],
            verbose=2,
            use_multiprocessing=True,
        )
        return history.history["loss"][0]
class Agent:
    """Double DQN agent: an online network picks actions and is trained,
    a target network supplies the bootstrap values."""

    def __init__(self, env):
        """Create both networks (with identical weights) and an empty buffer.

        Args:
            env: Environment exposing ``reset()`` and ``step(action)``.
        """
        self.env = env
        self.predict_net = ActionStatemodel()
        self.target_net = ActionStatemodel()
        self.target_update()
        self.buffer = ReplayBuffer()

    def target_update(self):
        """Copy the online network's weights into the target network."""
        weights = self.predict_net.model.get_weights()
        self.target_net.model.set_weights(weights)

    @staticmethod
    def _double_dqn_targets(current_q, online_next_q, target_next_q,
                            actions, rewards, done, gamma):
        """Build the regression targets for one Double DQN batch.

        Args:
            current_q: Online-network Q-values for the sampled states,
                shape (batch, n_actions).
            online_next_q: Online-network Q-values for the next states.
            target_next_q: Target-network Q-values for the next states.
            actions: Index of the action taken in each transition.
            rewards: Reward received in each transition.
            done: Truthy where the transition ended the episode.
            gamma: Discount factor.

        Returns:
            A copy of ``current_q`` in which only the taken action's entry is
            replaced by ``reward + gamma * Q_target(s', argmax_a Q_online(s', a))``
            (just ``reward`` for terminal transitions).
        """
        rows = np.arange(len(actions))
        # Double DQN: the online net selects the action, the target net rates it.
        best_next_actions = np.argmax(online_next_q, axis=1)
        bootstrap = np.asarray(target_next_q)[rows, best_next_actions]
        not_done = 1.0 - np.asarray(done, dtype=np.float64)
        # Start from Q(s, .) so non-taken actions contribute zero error.
        # Starting from Q(s', .) would train those outputs toward the wrong
        # state's values.
        targets = np.array(current_q, copy=True)
        targets[rows, np.asarray(actions, dtype=np.int64)] = (
            np.asarray(rewards) + not_done * gamma * bootstrap
        )
        return targets

    def replay(self):
        """Train the online network on five sampled batches.

        Returns:
            Sum of the five batch losses.
        """
        loss = 0
        gamma = float(wandb.config.gamma)
        for _ in range(5):
            states, actions, rewards, next_states, done = self.buffer.sample()
            current_q = self.predict_net.predict(states)
            online_next_q = self.predict_net.predict(next_states)
            target_next_q = self.target_net.predict(next_states)
            targets = self._double_dqn_targets(
                current_q, online_next_q, target_next_q,
                actions, rewards, done, gamma,
            )
            loss += self.predict_net.train(states, targets)
        return loss

    def train(self):
        """Run the full training loop, logging one record per episode."""
        reward_scale = float(wandb.config.reward_discount)
        for ep in range(int(wandb.config.episodes)):
            done, total_reward, step, loss, exploration = False, 0, 0, 0, 0
            state = self.env.reset()
            while not done and step < wandb.env.max_steps:
                action, e = self.predict_net.get_action(state)
                exploration += e
                # NOTE(review): assumes env.step returns
                # (next_state, reward, done, info); confirm against HNS.
                next_state, reward, done, _ = self.env.step(action)
                # The buffer stores the scaled reward; the log keeps the raw one.
                self.buffer.put(state, action, reward * reward_scale, next_state, done)
                total_reward += reward
                # Wait for 1000 transitions, then train on every 10th step.
                if self.buffer.size() >= 1000 and step % 10 == 0:
                    loss = self.replay()
                state = next_state
                step += 1
            # Sync the target network once per episode.
            self.target_update()
            self.predict_net.epsilon = max(
                wandb.config.epsilon_decay * self.predict_net.epsilon,
                float(wandb.config.min_epsilon),
            )
            pre_weights = self.get_weights(self.predict_net.model.layers)
            tar_weights = self.get_weights(self.target_net.model.layers)
            print(colored(f"EP{ep}-Reward: {total_reward} Done: {done}", "green"))
            wandb.log({
                "episode": ep,
                "buffersize": self.buffer.size(),
                "EpReward": total_reward,
                "epsilon": self.predict_net.epsilon,
                "done": int(done),
                # Fraction of this episode's steps that were random.
                "Exploration": exploration / max(step, 1),
                "loss": loss,
                "pre_weights": pre_weights,
                "tar_weights": tar_weights,
            })

    def get_weights(self, layers):
        """Collect the flattened kernel of every layer that has weights.

        Args:
            layers: Iterable of objects with ``get_weights()`` and ``name``.

        Returns:
            Tuple ``(weights, names)``: ``weights[i]`` is the flattened first
            weight array of the layer called ``names[i]``. Layers without
            weights are skipped.
        """
        weights = []
        names = []
        for layer in layers:
            wb = layer.get_weights()
            if wb:
                weights.append(wb[0].flatten())
                names.append(layer.name)
        return weights, names
if __name__ == "__main__":
    # Build the environment from the wandb.env settings, train the agent,
    # then store the target network inside the wandb run directory.
    env = HNS(
        random_spawn=wandb.env.random_spawn,
        world_size=wandb.env.world_size,
        hider_count=wandb.env.hider_count,
    )
    agent = Agent(env=env)
    agent.train()
    model_path = os.path.join(wandb.run.dir, "model.h5")
    agent.target_net.model.save(model_path)
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。