Why does tf.GradientTape() use less GPU memory when I watch the model variables manually?
When I let tf.GradientTape() automatically watch the trainable variables of a ResNet model, the machine throws an out-of-memory error. The code looks like this:
x_mini = preprocess_input(x_train)
with tf.GradientTape() as tape:
    outputs = model(x_mini, training=True)
However, if I disable the automatic watching and watch the trainable variables manually, I can feed in much larger inputs without any memory problem. The code looks like this:
x_mini = preprocess_input(x_train)
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(model.trainable_variables)
    outputs = model(x_mini, training=True)
I wonder whether the tape misses some variables when I watch them manually.
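One way to check this would be to diff what each tape actually records: GradientTape exposes a watched_variables() method. A minimal sketch, reusing model and x_mini from the snippets above:

# Minimal check: list what each tape records, then diff the two sets.
# Reuses `model` and `x_mini` from the snippets above.
with tf.GradientTape() as auto_tape:
    _ = model(x_mini, training=True)
auto_names = {v.name for v in auto_tape.watched_variables()}

with tf.GradientTape(watch_accessed_variables=False) as manual_tape:
    manual_tape.watch(model.trainable_variables)
    _ = model(x_mini, training=True)
manual_names = {v.name for v in manual_tape.watched_variables()}

print('watched only in auto mode:  ', auto_names - manual_names)
print('watched only in manual mode:', manual_names - auto_names)

If both sets come out equal, the memory difference presumably isn't caused by missing variables.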
Here is runnable code (if you comment out option 1 so that only option 2 runs, the out-of-memory error appears). I am using a Tesla T4 GPU with 15 GB of memory and TensorFlow 2.3.
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
import tensorflow.keras.layers as ly

x_train = tf.convert_to_tensor(np.random.randint(0, 255, (900, 224, 224, 3)), dtype=tf.dtypes.float32)
y_train = tf.convert_to_tensor([0, 1, 0], dtype=tf.dtypes.float32)
print(x_train.shape)

tf.keras.backend.clear_session()
resnet_model = tf.keras.applications.resnet.ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
resnet_model.trainable = False
inputs = tf.keras.Input(shape=(224, 224, 3))
x = resnet_model(inputs, training=False)
x = ly.GlobalAveragePooling2D()(x)
x = ly.Dropout(0.2)(x)
outputs = ly.Dense(3, activation='softmax')(x)
model = Model(inputs, outputs)

mcross = tf.keras.losses.categorical_crossentropy
macc = tf.keras.metrics.categorical_accuracy
base_learning_rate = 0.0001
optimizer = tf.keras.optimizers.Adam(base_learning_rate)

def cross_entropy(y_true, y_pred):
    y_pred = y_pred / tf.reduce_sum(y_pred, axis=1, keepdims=True)
    y_pred = tf.clip_by_value(y_pred, 1e-3, 1 - 1e-3)
    return -tf.reduce_sum(y_true * tf.math.log(y_pred), axis=1)

# option 1
# manually watching the variables
with tf.GradientTape(watch_accessed_variables=False) as tape:
    tape.watch(model.trainable_variables)
    y_pred = model(x_train, training=True)
    loss = cross_entropy(y_train, tf.reduce_mean(y_pred, axis=0, keepdims=True))
gradients = tape.gradient(loss, model.trainable_variables)

# option 2
# automatically watching the variables
with tf.GradientTape() as tape:
    y_pred = model(x_train, training=True)
    loss = cross_entropy(y_train, tf.reduce_mean(y_pred, axis=0, keepdims=True))
gradients = tape.gradient(loss, model.trainable_variables)
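To quantify the gap between the two options, the peak allocator stats can be read per run. This is only a sketch: it assumes TF >= 2.5 for tf.config.experimental.reset_memory_stats / get_memory_info, which do not exist in the TF 2.3 used here:

# Sketch for measuring peak GPU memory per option. Assumes TF >= 2.5 for
# tf.config.experimental.get_memory_info / reset_memory_stats (not available
# in the TF 2.3 used above).
def peak_gpu_memory(fn, device='GPU:0'):
    tf.config.experimental.reset_memory_stats(device)  # zero the 'peak' counter
    fn()
    return tf.config.experimental.get_memory_info(device)['peak']

def run_option_1():
    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(model.trainable_variables)
        y_pred = model(x_train, training=True)
        loss = cross_entropy(y_train, tf.reduce_mean(y_pred, axis=0, keepdims=True))
    tape.gradient(loss, model.trainable_variables)

print('option 1 peak bytes:', peak_gpu_memory(run_option_1))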
And here is the error message:
---------------------------------------------------------------------------
ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-4-42e45caeae41> in <module>
     31 # automatically watching the variables
     32 with tf.GradientTape() as tape:
---> 33     y_pred = model(x_train, training=True)
     34     loss = cross_entropy(y_train, tf.reduce_mean(y_pred, axis=0, keepdims=True))
     35 gradients = tape.gradient(loss, model.trainable_variables)
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self,*args,**kwargs)
983
984 with ops.enable_auto_cast_variables(self._compute_dtype_object):
--> 985 outputs = call_fn(inputs,**kwargs)
986
987 if self._activity_regularizer:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py in call(self,inputs,training,mask)
384 """
385 return self._run_internal_graph(
--> 386 inputs,training=training,mask=mask)
387
388 def compute_output_shape(self,input_shape):
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py in _run_internal_graph(self, inputs, training=None, mask=None)
506
507 args,kwargs = node.map_arguments(tensor_dict)
--> 508 outputs = node.layer(*args,**kwargs)
509
510 # Update tensor_dict.
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
    983
    984         with ops.enable_auto_cast_variables(self._compute_dtype_object):
--> 985           outputs = call_fn(inputs, **kwargs)
    986
    987         if self._activity_regularizer:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/layers/convolutional.py in call(self,inputs)
245 inputs = array_ops.pad(inputs,self._compute_causal_padding(inputs))
246
--> 247 outputs = self._convolution_op(inputs,self.kernel)
248
249 if self.use_bias:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/util/dispatch.py in wrapper(*args,**kwargs)
199 """Call target,and fall back on dispatchers if there is a TypeError."""
200 try:
--> 201 return target(*args,**kwargs)
202 except (TypeError,ValueError):
203 # Note: convert_to_eager_tensor currently raises a ValueError,not a
/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/nn_ops.py in convolution_v2(input, filters, strides, padding, data_format, dilations, name)
   1016           data_format=data_format,
   1017           dilations=dilations,
-> 1018           name=name)
   1019
   1020
/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/nn_ops.py in convolution_internal(input, name, call_from_convolution, num_spatial_dims)
   1146           data_format=data_format,
   1147           dilations=dilations,
-> 1148           name=name)
   1149       else:
   1150         if channel_index == 1:
/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/nn_ops.py in _conv2d_expanded_batch(input, name)
   2590           data_format=data_format,
   2591           dilations=dilations,
-> 2592           name=name)
   2593     return squeeze_batch_dims(
   2594         input,
/opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/gen_nn_ops.py in conv2d(input, filter, use_cudnn_on_gpu, explicit_paddings, name)
936 return _result
937 except _core._NotOkStatusException as e:
--> 938 _ops.raise_from_not_ok_status(e,name)
939 except _core._FallbackException:
940 pass
/opt/conda/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
   6841   message = e.message + (" name: " + name if name is not None else "")
   6842   # pylint: disable=protected-access
-> 6843   six.raise_from(core._status_to_exception(e.code, message), None)
   6844   # pylint: enable=protected-access
   6845
/opt/conda/lib/python3.7/site-packages/six.py in raise_from(value,from_value)
ResourceExhaustedError: OOM when allocating tensor with shape[900,56,56,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2D]