如何在 Freeway(Atari「高速公路」)环境中解决深度 Q 学习问题
目前我正在 OpenAI Gym 的 Freeway 环境上使用深度强化学习算法。我们的模型表现出一种周期性:它似乎在学习,随后又会重置回原点。下面是每一集(episode)的奖励和用时图表:
class Q_agent():
    """Q-learning agent for the Atari Freeway environment.

    The state is a rolling stack of 4 preprocessed (grayscale, resized)
    frames. Actions are chosen epsilon-greedily from ``value_func``'s
    predictions, and the value function is trained from a small
    experience-replay buffer that is flushed every 25 samples.
    """

    def __init__(self, environment, value_func, gamma=0.9, epsilon=0.001, alpha=0.2):
        """
        Args:
            environment: unused; the agent always builds "Freeway-v0" itself
                (parameter kept for backward compatibility with callers).
            value_func: object exposing ``init_models``, ``predict_best`` and
                ``update``.
            gamma: discount factor for the bootstrapped Q target.
            epsilon: exploration probability of the epsilon-greedy policy.
            alpha: learning rate; stored but not used in this class —
                presumably consumed by ``value_func`` (TODO confirm).
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.value_function = value_func
        self.environment = gym.make("Freeway-v0")
        state = self.environment.reset()
        state = grayscale_resize(state)
        # NOTE(review): init_models receives 3 frames while prediction uses a
        # 4-frame stack — verify init_models' expected input length.
        states = [state, state, state]
        self.value_function.init_models(states)
        self.memory_Box = []  # experience-replay buffer, flushed at 25 entries

    def run_episode(self, ep=2000):
        """Run one episode and return the shaped cumulative reward.

        Args:
            ep: unused; kept for signature compatibility with callers.

        Returns:
            Sum of the shaped rewards (-1 per step, +100 per crossing).
        """
        # Fresh environment every episode.
        self.environment = gym.make("Freeway-v0")
        state = self.environment.reset()
        state = grayscale_resize(state)

        initial = 0             # frames observed so far, until the stack is full
        cumulative_return = 0
        # BUG FIX: the stack must hold 4 frames. The original allocated only
        # 2 (np.array([state, state])), so state_compilations[2] / [3] below
        # raised IndexError as soon as the warm-up frames were collected.
        state_compilations = np.array([state, state, state, state])
        terminal = False
        end = 0                 # step counter

        while not terminal:
            ## choosing the action
            if initial < 4:
                # Not enough frames yet to call predict — act randomly.
                action = np.random.choice(3)
            elif np.random.uniform() < self.epsilon:
                # Epsilon-greedy exploration.
                action = np.random.choice(3)
            else:
                # Greedy action from the value function.
                state_action_values = self.value_function.predict_best(state_compilations)
                print('state action value:', state_action_values)
                action = np.argmax(state_action_values)

            ## taking the action
            obs, reward, terminal, _ = self.environment.step(action)
            # Reward shaping: -1 per step, +100 for any nonzero env reward
            # (i.e. the chicken crossed the road).
            reward = -1 if reward == 0 else 100
            end += 1
            cumulative_return += reward
            obs = grayscale_resize(obs)

            ## push the new frame onto the 4-frame stack
            if initial < 4:
                state_compilations[initial] = obs
            else:
                state_compilations[:3] = state_compilations[1:]
                state_compilations[3] = obs

            ## compute the Q target for this transition
            if initial < 4:
                initial += 1
                V = reward  # no bootstrap until the stack is full
            else:
                V = reward + self.gamma * np.max(
                    self.value_function.predict_best(state_compilations))

            ## sample ~10% of transitions into the replay buffer
            if np.random.uniform() < 0.1 and initial >= 4:
                self.memory_Box.append([
                    state_compilations[0], state_compilations[1],
                    state_compilations[2], state_compilations[3],
                    action, V,
                ])
                # Train and clear the buffer once it reaches 25 samples.
                if len(self.memory_Box) == 25:
                    self.value_function.update(self.memory_Box)
                    self.memory_Box = []

            # NOTE(review): the episode is truncated after 5 steps — this
            # looks like leftover debug code; confirm before real training.
            if end == 5:
                terminal = True

        self.environment.close()
        return cumulative_return

    def run_simulation(self, start_ep=0, n_episodes=2):
        """Run ``n_episodes`` episodes, recording reward and wall time each.

        Args:
            start_ep: unused; kept for signature compatibility.
            n_episodes: number of episodes to run.

        Returns:
            Tuple ``(episodic_rewards, times)`` of two 1-D float arrays.
        """
        self.episodic_rewards = np.zeros(n_episodes)
        self.times = np.zeros(n_episodes)
        for ep in range(n_episodes):
            start_time = time.time()
            self.episodic_rewards[ep] = self.run_episode(ep)
            self.times[ep] = time.time() - start_time
            print(self.episodic_rewards[ep])
            print("--- %s seconds ---" % (time.time() - start_time))
        return self.episodic_rewards, self.times
代码还有更多,分享 Colab 链接可能更方便,但我们确实不确定是什么导致了这种周期性。我们的学习率为 0.005,与以往的 Q-Learning 论文相比似乎偏高。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。