How do I fix my override of the TF2 optimizer class, which fails when applying gradients?
I'm trying to implement my own optimizer (SGD + momentum) by overriding the original Keras SGD-with-momentum class. I want to pass in some pre-trained initialization values and hyperparameters (velocities, momenta, and learning rates) and use them as per-layer hyperparameters for SGD + momentum. After some tinkering with the TF2 optimizer class, I figured that overriding slot creation with my own velocity weights and then multiplying them by the momentum constants should do the trick; however, I hit an error when I call optim.apply_gradients(). Any input on whether I'm headed in the right direction? Here is my edited class:
import tensorflow as tf
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.training import training_ops


class ParamwiseSGD(tf.keras.optimizers.SGD):

    def __init__(self, learning_rate=1.0, lr_multipliers=None, velocity_multipliers=None,
                 momentum_multipliers=None, momentum=1.0, nesterov=False, clipvalue=1.0,
                 name='ParamwiseSGD', **kwargs):
        super(ParamwiseSGD, self).__init__(learning_rate, momentum=momentum,
                                           nesterov=nesterov, name=name, **kwargs)
        # Per-layer dictionaries, keyed by the first token of the variable name.
        self.lr_multipliers = lr_multipliers
        self.velocity_multipliers = velocity_multipliers
        self.momentum_multipliers = momentum_multipliers

    def _get_lr(self, name):
        tokens = name.split('/')
        return self.lr_multipliers[tokens[0]]

    def _create_slots(self, var_list):
        if self._momentum:
            for var in var_list:
                print(var.name)
                # Seed the momentum slot with the pre-trained velocity
                # instead of the default zeros initializer.
                new_val = self._get_velocity(var.name)
                self.add_slot(var, "momentum", initializer=new_val)

    def _get_velocity(self, name, shape=None):
        tokens = name.split('/')
        return self.velocity_multipliers[tokens[0]]

    def _get_mom(self, name):
        tokens = name.split('/')
        return self.momentum_multipliers[tokens[0]]

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_mult = self._get_lr(var.name)
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            momentum_mult = self._get_mom(name=var.name)
            return training_ops.resource_apply_keras_momentum(
                var.handle, momentum_var.handle,
                coefficients["lr_t"] * lr_mult, grad,
                coefficients["momentum"] * momentum_mult,
                use_locking=self._use_locking, use_nesterov=self.nesterov)
        else:
            return training_ops.resource_apply_gradient_descent(
                var.handle, coefficients["lr_t"] * lr_mult, grad,
                use_locking=self._use_locking)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_mult = self._get_lr(var.name)
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        momentum_var = self.get_slot(var, "momentum")
        momentum_mult = self._get_mom(var.name)
        return training_ops.resource_sparse_apply_keras_momentum(
            var.handle, momentum_var.handle,
            coefficients["lr_t"] * lr_mult, grad, indices,
            coefficients["momentum"] * momentum_mult,
            use_locking=self._use_locking, use_nesterov=self.nesterov)

    # TODO: how to fix this implementation
    def _resource_apply_sparse_duplicate_indices(self, grad, var, indices, **kwargs):
        lr_mult = self._get_lr(var.name)
        if self._momentum:
            return super(ParamwiseSGD, self)._resource_apply_sparse_duplicate_indices(
                grad, var, indices, **kwargs)
        else:
            var_device, var_dtype = var.device, var.dtype.base_dtype
            coefficients = (kwargs.get("apply_state", {}).get((var_device, var_dtype))
                            or self._fallback_apply_state(var_device, var_dtype))
            return resource_variable_ops.resource_scatter_add(
                var.handle, indices, -grad * coefficients["lr_t"] * lr_mult)
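For context on the slot override above: in the TF2 OptimizerV2 base class (the one tf.keras.optimizers.SGD subclasses here; in newer releases it lives at tf.keras.optimizers.legacy.SGD), add_slot builds the slot variable directly from a concrete tensor initializer, so the slot takes the shape of that tensor rather than the shape of the variable it belongs to. Here is a minimal standalone sketch of that behavior, using hypothetical names (toy_var, toy_velocity) that are not part of my code:

import tensorflow as tf

# Hypothetical illustration: a momentum slot seeded from a tensor whose shape
# differs from the owning variable ends up with the tensor's shape, which the
# momentum op later rejects.
opt = tf.keras.optimizers.SGD(momentum=0.9)
toy_var = tf.Variable(tf.zeros([4, 10]), name="dense/kernel")  # kernel-shaped variable
toy_velocity = tf.zeros([10])                                  # bias-shaped velocity

slot = opt.add_slot(toy_var, "momentum", initializer=toy_velocity)
print(toy_var.shape, slot.shape)  # (4, 10) vs. (10,)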
I then initialize it and use it with a simple TF2 training loop on the iris dataset:
optim_2 = ParamwiseSGD(
    momentum=.9, lr_multipliers=lr, momentum_multipliers=mom, velocity_multipliers=vel)

train_loss_results = []
train_accuracy_results = []
num_epochs = 201

for epoch in range(num_epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

    # Training loop - using batches of 32
    for x, y in train_dataset:
        # Optimize the model
        loss_value, grads = grad(model, x, y)
        optim_2.apply_gradients(zip(grads, model.trainable_variables))

        # Track progress
        epoch_loss_avg.update_state(loss_value)  # Add current batch loss
        # Compare predicted label to actual label
        # training=True is needed only if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        epoch_accuracy.update_state(y, model(x, training=True))

    # End epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    if epoch % 50 == 0:
        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(
            epoch, epoch_loss_avg.result(), epoch_accuracy.result()))
This throws the following error:
/usr/local/lib/python3.7/dist-packages/six.py in raise_from(value, from_value)

InvalidArgumentError: var and accum do not have the same shape[4,10] [10] [Op:ResourceApplyKerasMomentum]
Here is the toy problem I use to initialize the weights:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(4,)),  # input shape required
    tf.keras.layers.Dense(10, activation=tf.nn.relu),
    tf.keras.layers.Dense(3)
])

s_1 = (10,)
s_2 = (10,)
s_3 = (3,)

layer_1 = {"lr": .01, "mom": .9, "vel": tf.zeros(shape=s_1)}
layer_2 = {"lr": .01, "mom": .9, "vel": tf.zeros(shape=s_2)}
layer_3 = {"lr": .01, "mom": .9, "vel": tf.zeros(shape=s_3)}

vel = {"dense": layer_1['vel'], "dense_1": layer_2['vel'], "dense_2": layer_3['vel']}
mom = {"dense": layer_1['mom'], "dense_1": layer_2['mom'], "dense_2": layer_3['mom']}
lr = {"dense": layer_1['lr'], "dense_1": layer_2['lr'], "dense_2": layer_3['lr']}