A2C in TensorFlow doesn't converge

How do I fix A2C not converging in TensorFlow?

There is clearly something wrong with my implementation of the A2C algorithm. There is a Colab notebook that contains all of the code below. I ran it for more than 1,000,000 frames on the PongNoFrameskip-v4 gym environment and the reward never changed, staying at ~= -20 / -21.

a2c.py

from collections import deque
from time import perf_counter

import numpy as np
import tensorflow as tf
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import RMSprop

from models import CNNA2C
from utils import create_gym_env


class A2C:
    def __init__(
        self,
        envs,
        transition_steps=5,
        reward_buffer_size=100,
        gamma=0.99,
        value_coef=0.5,
        entropy_coef=0.01,
    ):
        assert envs,'No environments given'
        self.envs = envs
        self.n_envs = len(envs)
        self.input_shape = self.envs[0].observation_space.shape
        self.n_actions = self.envs[0].action_space.n
        self.model = CNNA2C(self.input_shape,self.n_actions)
        self.transition_steps = transition_steps
        self.total_rewards = deque(maxlen=reward_buffer_size)
        self.best_reward = -float('inf')
        self.mean_reward = -float('inf')
        self.states = np.zeros((self.n_envs,*self.input_shape))
        self.masks = np.ones(self.n_envs)
        self.reset_envs()
        self.steps = 0
        self.games = 0
        self.gamma = gamma
        self.episode_rewards = np.zeros(self.n_envs)
        self.last_reset_step = 0
        self.frame_speed = 0
        self.value_loss_coef = value_coef
        self.entropy_coef = entropy_coef

    def reset_envs(self):
        for i,env in enumerate(self.envs):
            self.states[i] = env.reset()

    def update_returns(self,new_values,returns,masks,rewards):
        returns[-1] = new_values
        for step in reversed(range(self.transition_steps)):
            returns[step] = (
                returns[step + 1] * self.gamma * masks[step + 1] + rewards[step]
            )

    def step_envs(self,actions,done_envs):
        observations = []
        for (i,env),action in zip(enumerate(self.envs),actions):
            state,reward,done,_ = env.step(action)
            self.steps += 1
            observations.append((state,done))
            self.episode_rewards[i] += reward
            if done:
                self.states[i] = env.reset()
                self.games += 1
                self.total_rewards.append(self.episode_rewards[i])
                self.episode_rewards[i] *= 0
                done_envs.append(1)
        return [np.array(item) for item in zip(*observations)]

    def play_steps(self,done_envs):
        state_b,action_b,log_prob_b,value_b,reward_b,mask_b = [
            [] for _ in range(6)
        ]
        state_b.append(self.states)
        mask_b.append(self.masks)
        for step in range(self.transition_steps):
            actions,log_probs,entropy,values = self.model(self.states)
            states,rewards,dones = self.step_envs(actions,done_envs)
            self.states = states
            self.masks[np.where(dones)] = 0
            state_b.append(states)
            action_b.append(actions)
            log_prob_b.append(log_probs)
            value_b.append(values)
            reward_b.append(rewards)
            mask_b.append(self.masks)
        *_, new_values = self.model(state_b[-1])
        results = new_values, state_b, action_b, log_prob_b, value_b, reward_b, mask_b
        return [np.array(item) for item in results]

    @tf.function
    def train_step(self, states, actions, returns):
        with tf.GradientTape() as tape:
            # Re-evaluate the stored transitions to get log-probs, entropy and values.
            _, log_probs, entropy, values = self.model(
                states, actions=actions, training=True
            )
            advantages = returns - values
            actor_loss = -tf.reduce_mean(advantages * log_probs)
            critic_loss = Huber()(values, returns)
            loss = (
                critic_loss * self.value_loss_coef
                + actor_loss
                - entropy * self.entropy_coef
            )
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables)
        )

    def display_metrics(self):
        display_titles = (
            'frame',
            'games',
            'speed',
            'mean reward',
            'best reward',
        )
        display_values = (
            self.steps,
            self.games,
            f'{round(self.frame_speed)} steps/s',
            self.mean_reward,
            self.best_reward,
        )
        display = (
            f'{title}: {value}' for title,value in zip(display_titles,display_values)
        )
        print(','.join(display))

    def update_metrics(self,start_time):
        """
        Update progress metrics.
        Args:
            start_time: Episode start time,used for calculating fps.
        Returns:
            None
        """
        self.frame_speed = (self.steps - self.last_reset_step) / (
            perf_counter() - start_time
        )
        self.last_reset_step = self.steps
        self.mean_reward = np.around(np.mean(self.total_rewards),2)
        self.best_reward = max(self.mean_reward,self.best_reward)

    def fit(self,target_reward,learning_rate=7e-4):
        self.model.compile(RMSprop(learning_rate))
        returns = np.zeros((self.transition_steps + 1,self.n_envs),np.float32)
        done_envs = []
        start_time = perf_counter()
        while True:
            if len(done_envs) == self.n_envs:
                self.update_metrics(start_time)
                start_time = perf_counter()
                self.display_metrics()
                done_envs.clear()
            if self.mean_reward >= target_reward:
                print(f'Reward achieved in {self.steps} steps!')
                break
            new_values, *buffers = self.play_steps(done_envs)
            state_b, action_b, log_prob_b, value_b, reward_b, mask_b = buffers
            self.update_returns(new_values, returns, mask_b, reward_b)
            self.train_step(
                state_b[:-1].reshape(-1, *self.input_shape),
                action_b.reshape(-1),
                returns[:-1].reshape(-1),
            )


if __name__ == '__main__':
    en = create_gym_env('PongNoFrameskip-v4',16)
    agn = A2C(en)
    agn.fit(18)
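
utils.py isn't included above; roughly, create_gym_env builds a list of n independent Atari environments with the standard preprocessing (grayscale, 84x84 resize, frame skipping and frame stacking). A minimal sketch of what it does (the exact wrappers below are approximate, not the code from the notebook):

import gym
from gym.wrappers import AtariPreprocessing, FrameStack


def create_gym_env(env_name, n_envs):
    # Approximate stand-in for utils.create_gym_env; the real helper may
    # stack/transpose frames differently (e.g. channels-last for Conv2D).
    envs = []
    for _ in range(n_envs):
        env = gym.make(env_name)
        env = AtariPreprocessing(env)  # grayscale, resize to 84x84, frame skip
        env = FrameStack(env, 4)       # stack the last 4 frames
        envs.append(env)
    return envs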

models.py

import tensorflow as tf
from tensorflow.keras.initializers import Orthogonal
from tensorflow.keras.layers import Conv2D,Dense,Flatten
from tensorflow.keras.models import Model,Sequential
from tensorflow_probability.python.distributions import Categorical


class CNNA2C(Model):
    def __init__(
        self,
        input_shape,
        n_actions,
        relu_gain=tf.math.sqrt(2.0),
        fc_units=512,
        actor_gain=0.01,
        critic_gain=1.0,
    ):
        relu_initializer = tf.initializers.Orthogonal(gain=relu_gain)
        super(CNNA2C,self).__init__()
        l1 = Conv2D(
            filters=32,
            input_shape=input_shape,
            kernel_size=8,
            strides=4,
            activation='relu',
            kernel_initializer=relu_initializer,
        )
        l2 = Conv2D(filters=64, kernel_size=4, strides=2)
        l3 = Conv2D(filters=64, kernel_size=3, strides=1)
        l4 = Flatten()
        l5 = Dense(fc_units, kernel_initializer=relu_initializer)
        self.common = Sequential([l1, l2, l3, l4, l5])
        self.critic = Dense(1, kernel_initializer=Orthogonal(critic_gain))
        self.actor = Dense(n_actions, kernel_initializer=Orthogonal(gain=actor_gain))

    @tf.function
    def call(self,inputs,training=True,mask=None,actions=None):
        common = self.common(inputs,training=training)
        value = tf.squeeze(self.critic(common),axis=1)
        actor_features = self.actor(common)
        distribution = Categorical(logits=actor_features)
        if actions is None:
            actions = distribution.sample()
        action_log_probs = distribution.log_prob(actions)
        return (
            actions,
            action_log_probs,
            tf.reduce_mean(distribution.entropy()),
            value,
        )

    def get_config(self):
        return super(CNNA2C, self).get_config()
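
For reference, the model's call returns sampled actions, their log-probabilities, a scalar mean entropy, and the critic's value estimates. A quick shape check of a single forward pass (the 84x84x4 observation shape and Pong's 6 actions are assumptions here, not values from the notebook):

import numpy as np

from models import CNNA2C

model = CNNA2C(input_shape=(84, 84, 4), n_actions=6)
dummy_states = np.zeros((16, 84, 84, 4), dtype=np.float32)  # batch of 16 envs
actions, log_probs, entropy, values = model(dummy_states)
print(actions.shape, log_probs.shape, values.shape)  # (16,), (16,), (16,)
print(float(entropy))  # mean policy entropy over the batch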
