How to fix double deep Q-learning: the "done" state is always False
I am trying to understand double deep Q-learning. For that, I read the settings from a JSON file whose path can be passed as a parameter. I then make a simple call to learning_process, which is implemented in the Learning class and uses the Agent class:
double_dqnlearning.Learning().learning_process("path.to.file.json")
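For reference, the JSON only needs the hyperparameters that Agent.__init__ reads below; a minimal settings file could look like this (the values are only illustrative placeholders, except onehot_state_size=500 and action_size=6, which match the Taxi-v3 state and action spaces):
{
    "learning_rate": 0.00025,
    "onehot_state_size": 500,
    "action_size": 6,
    "tau": 0.001,
    "batch_size": 32,
    "gamma": 0.99,
    "epsilon": 1.0,
    "epsilon_decay": 0.995,
    "epsilon_min": 0.01,
    "t_range": 200,
    "episodes": 1000,
    "monitor_interval": 10
}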
My problem is that the "done" status I get here is always equal to False:
observation,reward,done,info = agent.streets.step(action)
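To rule out the environment itself, a quick sanity check can be run outside of my code (a minimal sketch assuming the pre-0.26 gym API, where step returns four values); with random actions, done does eventually become True once the passenger is dropped off:
import gym

# minimal sanity check, assuming step returns (obs, reward, done, info)
env = gym.make("Taxi-v3").env
obs = env.reset()
done = False
steps = 0
# the unwrapped env has no time limit, so a random policy can take many steps,
# but done flips to True once the passenger is dropped off at the destination
while not done and steps < 200000:
    obs, reward, done, info = env.step(env.action_space.sample())
    steps += 1
print("done:", done, "after", steps, "steps")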
Here is the full implementation.
import gym
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import h5py as h5
import time
import pickle
import numpy as np
import random
import matplotlib.pyplot as plt
import json
class Agent:
def __init__(self,settings):
random.seed(42)
self.streets = gym.make("Taxi-v3").env
        # load hyperparameters from the JSON file; fall back to a default path if none is given
        settings_path = settings if settings is not None else 'settings/double_dqnlearning.json'
        with open(settings_path) as f:
            model_settings = json.load(f)
self.streets.reset()
self.streets.render()
initial_state = self.streets.encode(2,3,2,0)
self.streets.s = initial_state
self.streets.render()
# hyper parameters
# the learning rate used by RMSProp in "human-level control through deep reinforcement learning"
learning_rate = model_settings["learning_rate"]
# momentum = model_settings["momentum"]
# taxi environment
state_size = 1
onehot_state_size = model_settings["onehot_state_size"]
action_size = model_settings["action_size"]
# soft target update value used in "continuous control with deep reinforcement learning"
tau = model_settings["tau"]
batch_size = model_settings["batch_size"]
# gamma in Bellman equation
gamma = model_settings["gamma"]
# epsilon in epsilon greedy algorithm
# we implement epsilon decay
epsilon = model_settings["epsilon"]
epsilon_decay = model_settings["epsilon_decay"]
epsilon_min = model_settings["epsilon_min"]
# max step in each episode
self.t_range = model_settings["t_range"]
# training settings
self.episodes = model_settings["episodes"]
self.monitor_interval = model_settings["monitor_interval"]
self.n_in = onehot_state_size
self.n_out = action_size
self.total_reward = 0
self.gamma = gamma
self.tau = tau
self.epsilon = epsilon
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.batch_size = batch_size
self.replay_buffer = Replay()
# choose network to implement
# self.online_model = Network(self.n_in,self.n_out)._build_model_1(learning_rate)
# self.target_model = Network(self.n_in,self.n_out)._build_model_1(learning_rate)
self.online_model = Network(self.n_in,self.n_out)._build_model_2(learning_rate)
self.target_model = Network(self.n_in,self.n_out)._build_model_2(learning_rate)
    def gather_experience(self,last_observation,action,reward,observation):
        # store the full transition so q_update can recover the action and reward later
        self.replay_buffer.write((last_observation,action,reward,observation))
def set_total_reward(self,new_total):
self.total_reward = new_total
def gather_reward(self,reward):
self.total_reward += reward
def get_total_reward(self):
return self.total_reward
def decay_epsilon(self):
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
def one_hot_encode(self,observation):
state = np.zeros(self.n_in)
state[observation] = 1
state = np.reshape(state,(-1,self.n_in))
return state
def choose_action(self,observation):
# exploitation
# np.random.rand is uniform [0,1]
if np.random.rand() > self.epsilon:
state = self.one_hot_encode(observation)
return np.argmax(self.online_model.predict(state)[0])
# exploration
else:
            # random action over all n_out actions (np.random.randint excludes high)
            return int(np.random.randint(low=0,high=self.n_out,size=1,dtype='int'))
# update q values
def q_update(self):
# batch to update q values and train online model
batch = self.replay_buffer.read(self.batch_size)
n = np.shape(batch)[0]
# initialize training data
x_batch = np.zeros([n,self.n_in])
y_batch = np.zeros([n,self.n_out])
counter = 0
        for b in batch:
            last_state,action,reward,state = b
            last_state = self.one_hot_encode(last_state)
            q_last = self.online_model.predict(last_state)[0]
            if state is None:
                # terminal transition: the target is just the reward
                q_this = reward
            else:
                state = self.one_hot_encode(state)
                # select the next action with the online model ...
                action_online = np.argmax(self.online_model.predict(state)[0])
                # ... and evaluate it with the target model (double DQN)
                q_this_target = self.target_model.predict(state)[0][action_online]
                q_this = reward + self.gamma * q_this_target
            # update the q value of the action that was actually taken
            q_last[action] = q_this
            x_batch[counter,:] = last_state
            y_batch[counter,:] = q_last
            counter += 1
# train online model
history = self.online_model.fit(x_batch,y_batch,epochs=1,verbose=0)
# debug
        print("trained online model")
# return online model loss
return history.history['loss'][0]
# update target model
def update_target_model(self):
# get_weights returns list of weights of each layer
theta_online = self.online_model.get_weights()
theta_target = self.target_model.get_weights()
# soft target update from "continuous control with DRL"
counter = 0
for weight_online,weight_target in zip(theta_online,theta_target):
            # these update equations should be compared with the paper
# target weight is a weighted average of target weight and online weight
weight_target = weight_target * (1 - self.tau) + weight_online * self.tau
# update target weight
theta_target[counter] = weight_target
# iterate
counter += 1
# update target model
self.target_model.set_weights(theta_target)
# debug
print("updated target model")
def plotting_reward(self,ep_rewards):
# plot the reward result
        # one point per completed episode (rewards are only recorded when done is reached)
        episode = range(len(ep_rewards))
        plt.plot(episode,ep_rewards)
plt.ylabel("total rewards per episode")
plt.xlabel("episode")
plt.title("DQN Taxi rewards")
plt.savefig(SAVE_fig_REWARD)
plt.show()
    def plotting_loss(self,losses):
        # plot the loss result
        episode = range(len(losses))
        plt.plot(episode,losses)
        plt.ylabel("loss per episode")
        plt.xlabel("episode")
        plt.title("DQN Taxi losses")
        plt.savefig(SAVE_fig_LOSS)
        plt.show()
# experience replay
class Replay:
def __init__(self):
self.buffer = []
self.length = 0
self.max_length = 10000
def write(self,data):
if self.length >= self.max_length:
self.buffer.pop(0)
self.length -= 1
self.buffer.append(data)
self.length += 1
def read(self,batch_size):
# at beginning buffer is almost empty,so batch is smaller than batch_size
return random.sample(self.buffer,min(batch_size,self.length))
class Network:
def __init__(self,n_in,n_out):
self.n_in = n_in
self.n_out = n_out
def _build_model_1(self,learning_rate):
model = Sequential()
model.add(Dense(24,input_shape=(self.n_in,),activation='relu'))
model.add(Dense(48,activation='relu'))
model.add(Dense(self.n_out,activation='linear'))
optimizer = tf.keras.optimizers.RMSprop(learning_rate)
model.compile(loss='mse',optimizer=optimizer)
# debug
# print("compiled model")
return model
def _build_model_2(self,learning_rate):
model = Sequential()
model.add(Dense(self.n_out,activation='linear',use_bias=False))
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(loss='mse',optimizer=optimizer)
return model
class Learning:
def learning_process(self,settings):
agent = Agent(settings)
ep_rewards = []
loss = False
losses = []
start_time = time.time()
for ep in range(agent.episodes):
# initialize
# env.reset() in taxi returns index of states out of 500
last_observation = agent.streets.reset()
agent.set_total_reward(0)
# iterations within an episode
for t in range(agent.t_range):
# draw action
action = agent.choose_action(last_observation)
# draw next state and reward
                observation,reward,done,info = agent.streets.step(action)
                # when the taxi drops the passenger at the destination, done = True
                if done:
                    observation = None
                # accumulate reward
                agent.gather_reward(reward)
                agent.gather_experience(last_observation,action,reward,observation)
# update q values
if np.random.random() < 0.3 and len(ep_rewards) > 0:
if ep_rewards[-1] < 9.7:
# train online model
loss = agent.q_update()
# update target model
agent.update_target_model()
# iterate
last_observation = observation
# goal
if done:
ep_rewards.append(agent.get_total_reward())
break
# store last loss of online model
if loss:
losses.append(loss)
# In each episode we decay epsilon
agent.decay_epsilon()
# Monitor total reward during episodes
if ep % agent.monitor_interval == 0 and loss:
print("episode:",ep,"reward:",agent.get_total_reward(),"loss:",np.round(loss,decimals=3),"epsilon:",np.round(agent.epsilon,decimals=5),"time: {} seconds".format(np.round(time.time() - start_time,decimals=0)))
# when training finishes
# save weights of neural network
agent.online_model.save(PATH_ONLINE_SAVE_WEIGHTS)
agent.target_model.save(PATH_TARGET_SAVE_WEIGHTS)
agent.plotting_reward(ep_rewards)
agent.plotting_loss(losses)