微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

Tensorflow 2 Keras 模型策略梯度网络返回超出有效操作范围的值

如何解决Tensorflow 2 Keras 模型策略梯度网络返回超出有效操作范围的值

我对 tensorflow2 和 keras 相当陌生。这是我的第一个项目。 该项目基于教程和适用于机器人模拟的 coppeliaSim。我的学习部分代码如下所示:

import datetime

import base

import config
import numpy as np

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from random import randrange



class PolicyGradientNetwork(keras.Model):
    """Two-hidden-layer MLP mapping a state vector to a softmax
    distribution over the discrete action space.

    Args:
        n_actions: size of the output (number of discrete actions).
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
    """

    def __init__(self, n_actions, fc1_dims=256, fc2_dims=256):
        super(PolicyGradientNetwork, self).__init__()
        # Layer sizes are kept as attributes so the architecture can be
        # inspected after construction.
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.fc1 = Dense(self.fc1_dims, activation="relu")
        self.fc2 = Dense(self.fc2_dims, activation="relu")
        self.pi = Dense(n_actions, activation="softmax")

    def call(self, state):
        """Forward pass: state -> per-action probabilities (softmax)."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)


class RL(base.BaseRLStrategy):
    """REINFORCE (Monte-Carlo policy gradient) agent.

    Collects one episode of (state, action, reward) transitions via
    store_transition() and, on train(), performs a single policy-gradient
    update with loss = -sum_t G_t * log pi(a_t | s_t), then clears the
    episode buffers.
    """

    def __init__(self, *args, **kwargs):
        super(RL, self).__init__(*args, **kwargs)
        self.alpha = config.alpha  # learning rate
        self.gamma = config.gamma  # reward discount factor
        self.n_actions = 3

        # Per-episode rollout buffers, cleared at the end of train().
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        self.policy = PolicyGradientNetwork(self.n_actions)
        self.policy.compile(optimizer=Adam(learning_rate=self.alpha))

    def choose_action(self, state):
        """Sample an action index in [0, n_actions) from the current policy.

        BUG FIX: tfp.distributions.Categorical(probs=...) samples the
        out-of-range value `n_actions` when the probability vector contains
        NaN (e.g. after the network diverges during training) — that is
        where the "action 3" came from. Instead of masking the symptom with
        a hard-coded `== 3` check, detect non-finite probabilities up front
        and fall back to a uniform random action.
        """
        my_state = tf.convert_to_tensor([np.array(state)], dtype=tf.float32)
        probs = self.policy(my_state)
        # Non-finite output means the network has diverged; sampling from
        # it would yield an index outside [0, n_actions).
        if not np.all(np.isfinite(probs.numpy())):
            print("Policy returned non-finite probabilities; choosing randomly.")
            return randrange(self.n_actions)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        chosen = int(action.numpy()[0])
        # Defensive guard (was the hard-coded `== 3`): never hand callers
        # an out-of-range index.
        if chosen >= self.n_actions:
            print("Action was out of range....")
            return randrange(self.n_actions)
        return chosen

    def store_transition(self, state, action, reward):
        """Append one (state, action, reward) step to the episode buffers."""
        self.state_memory.append(state)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def train(self):
        """Run one REINFORCE update over the stored episode, then reset."""
        print("TRAINING MY AGEND!")
        actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32)
        rewards = np.array(self.reward_memory, dtype=np.float32)

        # Discounted returns G[t] = sum_{k>=t} gamma^(k-t) * r[k], computed
        # in O(n) by accumulating backwards from the episode end (replaces
        # the original O(n^2) double loop; same values).
        G = np.zeros_like(rewards)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            G[t] = running

        with tf.GradientTape() as tape:
            loss = 0
            for idx, (g, state) in enumerate(zip(G, self.state_memory)):
                state = tf.convert_to_tensor([state], dtype=tf.float32)
                probs = self.policy(state)
                action_probs = tfp.distributions.Categorical(probs=probs)
                log_prob = action_probs.log_prob(actions[idx])
                # Gradient ascent on expected return == descent on -G*logpi.
                loss += -g * tf.squeeze(log_prob)
        gradient = tape.gradient(loss, self.policy.trainable_variables)
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))
        # Clear the rollout buffers for the next episode.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        print("END TRAINING MY AGEND!")

    def load(self):
        # Checkpoint loading not implemented yet.
        pass

    def save(self):
        # Checkpoint saving not implemented yet.
        pass

你可以看到,选择一个动作的部分是 sus

    def choose_action(self,state):
        my_state = tf.convert_to_tensor([np.array(state)],dtype=tf.float32)
        probs = self.policy(my_state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        if action.numpy()[0] == 3:
            print("Action was 3....")
            return randrange(3)
        return action.numpy()[0]

我知道只添加一个随机范围 3 是行不通的,但我不明白为什么我的模型给我的值为 3。我的操作是 3(0、1 或 2)。前几次迭代一切都像魅力一样,但在 3-4 集之后,我将动作 3 作为“有效”动作返回。如果我只是让它滑动而不抓住它,我会得到一个错误,即 3 不在 [0,3) 的范围内,这应该是正确的,因为我只有 0,1 和 2 作为有效操作。我无法理解为什么我会从网络中得到 3 作为一个动作,当它自己说 3 不在训练的有效动作范围内时。

可能是 tfp.distribution 吗?请帮帮我,我很绝望。

我在 MacO 上使用 tensorflow 2.5.0,状态是一组传感器信息 (3)。

控制台错误

2021-07-04 23:42:39.095530: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES Failed at sparse_xent_op.cc:90 : Invalid argument: Received a label value of 3 which is outside the valid range of [0,3).  Label values: 3
Traceback (most recent call last):
  File "/Users/mondry/Documents/GIT/RoboticRL/main_pg.py",line 40,in <module>
    rl.train()
  File "/Users/mondry/Documents/GIT/RoboticRL/policyGradient.py",line 81,in train
    log_prob = action_probs.log_prob(actions[idx])
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow_probability/python/distributions/distribution.py",line 1296,in log_prob
    return self._call_log_prob(value,name,**kwargs)
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow_probability/python/distributions/distribution.py",line 1278,in _call_log_prob
    return self._log_prob(value,**kwargs)
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow_probability/python/distributions/categorical.py",line 295,in _log_prob
    return -tf.nn.sparse_softmax_cross_entropy_with_logits(
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py",line 206,in wrapper
    return target(*args,**kwargs)
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/ops/nn_ops.py",line 4228,in sparse_softmax_cross_entropy_with_logits_v2
    return sparse_softmax_cross_entropy_with_logits(
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py",line 4139,in sparse_softmax_cross_entropy_with_logits
    cost,_ = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/ops/gen_nn_ops.py",line 11250,in sparse_softmax_cross_entropy_with_logits
    _ops.raise_from_not_ok_status(e,name)
  File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py",line 6897,in raise_from_not_ok_status
    six.raise_from(core._status_to_exception(e.code,message),None)
  File "<string>",line 3,in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Received a label value of 3 which is outside the valid range of [0,3).  Label values: 3 [Op:SparseSoftmaxCrossEntropyWithLogits]

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐


Selenium Web驱动程序和Java。元素在(x,y)点处不可单击。其他元素将获得点击?
Python-如何使用点“。” 访问字典成员?
Java 字符串是不可变的。到底是什么意思?
Java中的“ final”关键字如何工作?(我仍然可以修改对象。)
“loop:”在Java代码中。这是什么,为什么要编译?
java.lang.ClassNotFoundException:sun.jdbc.odbc.JdbcOdbcDriver发生异常。为什么?
这是用Java进行XML解析的最佳库。
Java的PriorityQueue的内置迭代器不会以任何特定顺序遍历数据结构。为什么?
如何在Java中聆听按键时移动图像。
Java“Program to an interface”。这是什么意思?