How to fix non-converging A2C in TensorFlow

There is evidently something wrong with my implementation of the A2C algorithm. Here is a colab notebook containing all of the code below. I ran the code for over 1,000,000 frames on the PongNoFrameskip-v4 gym environment, and the reward did not change, staying around -20 / -21.
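For reference, the loss I am trying to minimize in `train_step` below is the standard A2C objective (advantage-weighted policy gradient, plus a value loss and an entropy bonus):

$$L = c_v\,\mathrm{Huber}(V(s_t), R_t) \;-\; \frac{1}{N}\sum_t A_t \log \pi(a_t \mid s_t) \;-\; c_e\,H(\pi), \qquad A_t = R_t - V(s_t)$$

where $R_t$ is the bootstrapped n-step return, $c_v = 0.5$ (`value_coef`) and $c_e = 0.01$ (`entropy_coef`).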
a2c.py

```python
from collections import deque
from time import perf_counter
import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import RMSprop
from models import CNNA2C
from utils import create_gym_env
class A2C:
    def __init__(
        self,
        envs,
        transition_steps=5,
        reward_buffer_size=100,
        gamma=0.99,
        value_coef=0.5,
        entropy_coef=0.01,
    ):
        assert envs, 'No environments given'
        self.envs = envs
        self.n_envs = len(envs)
        self.input_shape = self.envs[0].observation_space.shape
        self.n_actions = self.envs[0].action_space.n
        self.model = CNNA2C(self.input_shape, self.n_actions)
        self.transition_steps = transition_steps
        self.total_rewards = deque(maxlen=reward_buffer_size)
        self.best_reward = -float('inf')
        self.mean_reward = -float('inf')
        self.states = np.zeros((self.n_envs, *self.input_shape))
        self.masks = np.ones(self.n_envs)
        self.reset_envs()
        self.steps = 0
        self.games = 0
        self.gamma = gamma
        self.episode_rewards = np.zeros(self.n_envs)
        self.last_reset_step = 0
        self.frame_speed = 0
        self.value_loss_coef = value_coef
        self.entropy_coef = entropy_coef
    def reset_envs(self):
        for i, env in enumerate(self.envs):
            self.states[i] = env.reset()

    def update_returns(self, new_values, returns, masks, rewards):
        # Bootstrap from the critic's estimate of the last observed states,
        # then accumulate discounted n-step returns backwards in time.
        returns[-1] = new_values
        for step in reversed(range(self.transition_steps)):
            returns[step] = (
                returns[step + 1] * self.gamma * masks[step + 1] + rewards[step]
            )
    def step_envs(self, actions, done_envs):
        observations = []
        for (i, env), action in zip(enumerate(self.envs), actions):
            state, reward, done, _ = env.step(action)
            self.steps += 1
            observations.append((state, reward, done))
            self.episode_rewards[i] += reward
            if done:
                self.states[i] = env.reset()
                self.games += 1
                self.total_rewards.append(self.episode_rewards[i])
                self.episode_rewards[i] = 0
                done_envs.append(1)
        return [np.array(item) for item in zip(*observations)]
    def play_steps(self, done_envs):
        state_b, action_b, log_prob_b, value_b, reward_b, mask_b = [
            [] for _ in range(6)
        ]
        state_b.append(self.states)
        mask_b.append(self.masks)
        for step in range(self.transition_steps):
            actions, log_probs, entropy, values = self.model(self.states)
            states, rewards, dones = self.step_envs(actions, done_envs)
            self.states = states
            self.masks[np.where(dones)] = 0
            state_b.append(states)
            action_b.append(actions)
            log_prob_b.append(log_probs)
            value_b.append(values)
            reward_b.append(rewards)
            mask_b.append(self.masks)
        # Value estimates for the final states, used to bootstrap the returns.
        *_, new_values = self.model(state_b[-1])
        results = new_values, state_b, action_b, reward_b, mask_b
        return [np.array(item) for item in results]
    @tf.function
    def train_step(self, states, actions, returns):
        with tf.GradientTape() as tape:
            # Re-evaluate the taken actions to get differentiable
            # log probabilities, entropy, and value estimates.
            _, log_probs, entropy, values = self.model(
                states, actions=actions, training=True
            )
            advantages = returns - values
            actor_loss = -tf.reduce_mean(advantages * log_probs)
            critic_loss = Huber()(values, returns)
            loss = (
                critic_loss * self.value_loss_coef
                + actor_loss
                - entropy * self.entropy_coef
            )
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables)
        )
    def display_metrics(self):
        display_titles = (
            'frame',
            'games',
            'speed',
            'mean reward',
            'best reward',
        )
        display_values = (
            self.steps,
            self.games,
            f'{round(self.frame_speed)} steps/s',
            self.mean_reward,
            self.best_reward,
        )
        display = (
            f'{title}: {value}'
            for title, value in zip(display_titles, display_values)
        )
        print(', '.join(display))
    def update_metrics(self, start_time):
        """
        Update progress metrics.

        Args:
            start_time: Episode start time, used for calculating fps.

        Returns:
            None
        """
        self.frame_speed = (self.steps - self.last_reset_step) / (
            perf_counter() - start_time
        )
        self.last_reset_step = self.steps
        self.mean_reward = np.around(np.mean(self.total_rewards), 2)
        self.best_reward = max(self.mean_reward, self.best_reward)
    def fit(self, target_reward, learning_rate=7e-4):
        self.model.compile(RMSprop(learning_rate))
        returns = np.zeros((self.transition_steps + 1, self.n_envs), np.float32)
        done_envs = []
        start_time = perf_counter()
        while True:
            if len(done_envs) == self.n_envs:
                self.update_metrics(start_time)
                start_time = perf_counter()
                self.display_metrics()
                done_envs.clear()
            if self.mean_reward >= target_reward:
                print(f'Reward achieved in {self.steps} steps!')
                break
            new_values, *buffers = self.play_steps(done_envs)
            state_b, action_b, reward_b, mask_b = buffers
            self.update_returns(new_values, returns, mask_b, reward_b)
            self.train_step(
                state_b[:-1].reshape(-1, *self.input_shape),
                action_b.reshape(-1),
                returns[:-1].reshape(-1),
            )
if __name__ == '__main__':
    en = create_gym_env('PongNoFrameskip-v4', 16)
    agn = A2C(en)
    agn.fit(18)
```
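As a sanity check, here is a minimal standalone sketch of the n-step return recurrence that `update_returns` implements. The numbers (one environment, three transition steps, a terminal transition on the last step) are made up purely for illustration:

```python
import numpy as np

# Hypothetical toy rollout: 3 transition steps, 1 environment.
gamma = 0.99
rewards = np.array([[1.0], [0.0], [1.0]])       # r_0 .. r_2
masks = np.array([[1.0], [1.0], [1.0], [0.0]])  # mask_0 .. mask_3; episode ends on the last step
new_values = np.array([5.0])                    # critic's V(s_3), the bootstrap value

returns = np.zeros((4, 1), np.float32)
returns[-1] = new_values
for step in reversed(range(3)):
    # R_t = r_t + gamma * mask_{t+1} * R_{t+1}
    returns[step] = returns[step + 1] * gamma * masks[step + 1] + rewards[step]

print(returns)
# returns[2] = 1.0 (terminal, so the bootstrap value is masked out)
# returns[1] = 0.99, returns[0] = 1.9801
```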
models.py

```python
import tensorflow as tf
from tensorflow.keras.initializers import Orthogonal
from tensorflow.keras.layers import Conv2D, Dense, Flatten
from tensorflow.keras.models import Model, Sequential
from tensorflow_probability.python.distributions import Categorical
class CNNA2C(Model):
    def __init__(
        self,
        input_shape,
        n_actions,
        relu_gain=tf.math.sqrt(2.0),
        fc_units=512,
        actor_gain=0.01,
        critic_gain=1.0,
    ):
        relu_initializer = Orthogonal(gain=relu_gain)
        super(CNNA2C, self).__init__()
        l1 = Conv2D(
            filters=32,
            input_shape=input_shape,
            kernel_size=8,
            strides=4,
            activation='relu',
            kernel_initializer=relu_initializer,
        )
        l2 = Conv2D(
            filters=64,
            kernel_size=4,
            strides=2,
        )
        l3 = Conv2D(
            filters=64,
            kernel_size=3,
            strides=1,
        )
        l4 = Flatten()
        l5 = Dense(fc_units, kernel_initializer=relu_initializer)
        self.common = Sequential([l1, l2, l3, l4, l5])
        self.critic = Dense(
            1,
            kernel_initializer=Orthogonal(gain=critic_gain),
        )
        self.actor = Dense(
            n_actions,
            kernel_initializer=Orthogonal(gain=actor_gain),
        )
    @tf.function
    def call(self, inputs, training=True, mask=None, actions=None):
        common = self.common(inputs, training=training)
        value = tf.squeeze(self.critic(common), axis=1)
        actor_features = self.actor(common)
        distribution = Categorical(logits=actor_features)
        if actions is None:
            # Acting: sample fresh actions from the current policy.
            actions = distribution.sample()
        # Training: score the given (or sampled) actions under the policy.
        action_log_probs = distribution.log_prob(actions)
        return (
            actions,
            action_log_probs,
            tf.reduce_mean(distribution.entropy()),
            value,
        )
    def get_config(self):
        return super(CNNA2C, self).get_config()
```
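For completeness, here is a minimal sketch of the `Categorical` usage the model relies on: sampling during rollouts, then re-scoring the stored actions with `log_prob` during the training pass. The logits below are made up; 6 actions matches Pong's action space:

```python
import tensorflow as tf
from tensorflow_probability.python.distributions import Categorical

# Made-up logits for a batch of 2 states and 6 actions.
logits = tf.constant([[0.1, 0.5, -0.2, 0.0, 0.3, -0.1],
                      [0.0, 0.0, 1.0, -1.0, 0.2, 0.4]])
distribution = Categorical(logits=logits)

# Acting: sample one action per state.
sampled = distribution.sample()

# Training: score actions that were taken earlier; this is the
# differentiable quantity the actor loss is built from.
taken = tf.constant([1, 2])
log_probs = distribution.log_prob(taken)

print(sampled.numpy(), log_probs.numpy(), distribution.entropy().numpy())
```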