如何解决Tensorflow 2 Keras 模型策略梯度网络返回超出有效操作范围的值
我对 tensorflow2 和 keras 相当陌生。这是我的第一个项目。 该项目基于教程和适用于机器人模拟的 coppeliaSim。我的学习部分代码如下所示:
import datetime
import base
import config
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from random import randrange
class PolicyGradientNetwork(keras.Model):
def __init__(self,n_actions,fc1_dims=256,fc2_dims=256):
super(PolicyGradientNetwork,self).__init__()
self.fc1_dims = fc1_dims
self.fc2_dims = fc2_dims
self.n_actions = n_actions
self.fc1 = Dense(self.fc1_dims,activation="relu")
self.fc2 = Dense(self.fc2_dims,activation="relu")
self.pi = Dense(n_actions,activation="softmax")
def call(self,state):
value = self.fc1(state)
value = self.fc2(value)
pi = self.pi(value)
return pi
class RL(base.BaseRLStrategy):
def __init__(self,*args,**kwargs):
super(RL,self).__init__(*args,**kwargs)
self.alpha = config.alpha # learning rate
self.gamma = config.gamma
self.n_actions = 3
self.state_memory = []
self.action_memory = []
self.reward_memory = []
self.policy = PolicyGradientNetwork(self.n_actions)
self.policy.compile(optimizer=Adam(learning_rate=self.alpha))
def choose_action(self,state):
my_state = tf.convert_to_tensor([np.array(state)],dtype=tf.float32)
probs = self.policy(my_state)
action_probs = tfp.distributions.Categorical(probs=probs)
action = action_probs.sample()
if action.numpy()[0] == 3:
print("Action was 3....")
return randrange(3)
return action.numpy()[0]
def store_transition(self,state,action,reward):
self.state_memory.append(state)
self.action_memory.append(action)
self.reward_memory.append(reward)
def train(self):
print("TRAINING MY AGEND!")
actions = tf.convert_to_tensor(self.action_memory,dtype=tf.float32)
rewards = tf.convert_to_tensor(self.reward_memory,dtype=tf.float32)
G = np.zeros_like(rewards)
for t in range(len(rewards)):
G_sum = 0
discount = 1
for k in range(t,len(rewards)):
G_sum += rewards[k]*discount
discount *= self.gamma
G[t] = G_sum
with tf.GradientTape() as tape:
loss = 0
for idx,(g,state) in enumerate(zip(G,self.state_memory)):
state = tf.convert_to_tensor([state],dtype=tf.float32)
probs = self.policy(state)
action_probs = tfp.distributions.Categorical(probs=probs)
log_prob = action_probs.log_prob(actions[idx])
loss += -g * tf.squeeze(log_prob)
gradient = tape.gradient(loss,self.policy.trainable_variables)
self.policy.optimizer.apply_gradients(zip(gradient,self.policy.trainable_variables))
self.state_memory = []
self.action_memory = []
self.reward_memory = []
print("END TRAINING MY AGEND!")
def load(self):
pass
def save(self):
pass
你可以看到,选择一个动作的部分是 sus
def choose_action(self,dtype=tf.float32)
probs = self.policy(my_state)
action_probs = tfp.distributions.Categorical(probs=probs)
action = action_probs.sample()
if action.numpy()[0] == 3:
print("Action was 3....")
return randrange(3)
return action.numpy()[0]
我知道只添加一个随机范围 3 是行不通的,但我不明白为什么我的模型给我的值为 3。我的操作是 3(0、1 或 2)。前几次迭代一切都像魅力一样,但在 3-4 集之后,我将动作 3 作为“有效”动作返回。如果我只是让它滑动而不抓住它,我会得到一个错误,即 3 不在 [0,3) 的范围内,这应该是正确的,因为我只有 0,1 和 2 作为有效操作。我无法理解为什么我会从网络中得到 3 作为一个动作,当它自己说 3 不在训练的有效动作范围内时。
可能是 tfp.distribution 吗?请帮帮我,我很绝望。
我在 MacO 上使用 tensorflow 2.5.0,状态是一组传感器信息 (3)。
控制台错误:
2021-07-04 23:42:39.095530: W tensorflow/core/framework/op_kernel.cc:1767] OP_REQUIRES Failed at sparse_xent_op.cc:90 : Invalid argument: Received a label value of 3 which is outside the valid range of [0,3). Label values: 3
Traceback (most recent call last):
File "/Users/mondry/Documents/GIT/RoboticRL/main_pg.py",line 40,in <module>
rl.train()
File "/Users/mondry/Documents/GIT/RoboticRL/policyGradient.py",line 81,in train
log_prob = action_probs.log_prob(actions[idx])
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow_probability/python/distributions/distribution.py",line 1296,in log_prob
return self._call_log_prob(value,name,**kwargs)
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow_probability/python/distributions/distribution.py",line 1278,in _call_log_prob
return self._log_prob(value,**kwargs)
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow_probability/python/distributions/categorical.py",line 295,in _log_prob
return -tf.nn.sparse_softmax_cross_entropy_with_logits(
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py",line 206,in wrapper
return target(*args,**kwargs)
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/ops/nn_ops.py",line 4228,in sparse_softmax_cross_entropy_with_logits_v2
return sparse_softmax_cross_entropy_with_logits(
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py",line 4139,in sparse_softmax_cross_entropy_with_logits
cost,_ = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/ops/gen_nn_ops.py",line 11250,in sparse_softmax_cross_entropy_with_logits
_ops.raise_from_not_ok_status(e,name)
File "/Users/mondry/Documents/GIT/RoboticRL/venv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py",line 6897,in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code,message),None)
File "<string>",line 3,in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Received a label value of 3 which is outside the valid range of [0,3). Label values: 3 [Op:SparsesoftmaxCrossEntropyWithLogits]
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。