How to fix validation loss that increases from the very beginning
I have been working on a very simple binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what is strange to me is that the validation loss starts rising from the very beginning. I have tried many different sets of hyperparameters, including L2 regularization, learning-rate decay and stochastic gradient descent, as well as a larger training set, but the problem persists. Here is the learning curve from one of the trials (the horizontal axis should read "per 10 epochs"):
The hyperparameters were: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05.
Here are the learning curves from some other trials:
I developed my model following what is taught in Andrew Ng's Deep Learning Specialization, so I did not expect many bugs. My full code is attached below, as requested:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special
# Data preprocessing (the same for the dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train','cats_train']
epsilon = 1e-8
for category in categories:
    path_animal = os.path.join(path, category)
    for img in os.listdir(path_animal):
        try:
            img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
            new_img_array = cv2.resize(img_array, (img_size, img_size))
            flattened_img_array = new_img_array.reshape(img_size*img_size)
            train_set.append([flattened_img_array, categories.index(category)])
        except:
            continue  # skip images that cv2 fails to read or resize
import random
random.shuffle(train_set)
X_train = []
Y_train = []
for sample in train_set:
    X_train.append(sample[0])
    Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1,np.array(Y_train).shape[0]))
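# Illustrative shape check (not in the original post): after the steps above,
# X_train should be (6400, m) and Y_train (1, m), with one column per example.
# print(X_train.shape, Y_train.shape)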
def create_mini_batches(X, Y, mini_batch_size):
    m = X.shape[1]
    mini_batches = []
    num_mini_batches = m // mini_batch_size
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    for i in range(num_mini_batches):
        select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
        select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
        mini_batch = (select_X, select_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
        last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
        last_mini_batch = (last_X, last_Y)
        mini_batches.append(last_mini_batch)
    return mini_batches
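# Illustrative sanity check (not part of the original script): the batches
# returned above should partition all m training examples exactly once.
# batches = create_mini_batches(X_train, Y_train, 64)
# assert sum(Xb.shape[1] for Xb, Yb in batches) == X_train.shape[1]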
def initialize_parameters(layers_dims):
    L = len(layers_dims)  # number of layers (including the input layer), in this case L=4
    parameters = {}
    for l in range(1, L):  # range(1,4)
        # He initialization, suited to (leaky) ReLU units
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters
def sigmoid(Z):
    A = special.expit(Z)
    return A, Z

def relu(Z):
    A = np.maximum(0.01*Z, Z)  # note: this is leaky ReLU (slope 0.01), matching relu_gradient below
    return A, Z
def forward_propagation(X, parameters):
    caches = []  # list containing the cache for every layer
    A = X
    L = int(len(parameters)/2)
    for l in range(1, L):
        A_prev = A
        W = parameters['W'+str(l)]
        b = parameters['b'+str(l)]
        Z = np.dot(W, A_prev) + b
        A, activation_cache = relu(Z)  # activation_cache contains Z[l]
        linear_cache = (A_prev, W, b)  # linear_cache contains A[l-1], W[l], b[l]
        cache = (linear_cache, activation_cache)
        caches.append(cache)
    W = parameters['W'+str(L)]
    b = parameters['b'+str(L)]
    Z = np.dot(W, A) + b
    AL, activation_cache = sigmoid(Z)
    linear_cache = (A, W, b)  # A[L-1], W[L], b[L]
    cache = (linear_cache, activation_cache)
    caches.append(cache)
    return AL, caches
def compute_cost(AL, Y, parameters, lambd):
    m = Y.shape[1]  # number of examples
    L = int(len(parameters)/2)  # e.g. layers_dims = [6400,50,10,1] gives L = 3
    reg_cost = 0
    for l in range(L):
        W = parameters['W' + str(l+1)]
        reg_cost += np.sum(np.square(W))
    # cross-entropy (epsilon guards against log(0)) plus the L2 penalty
    J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
    J = np.squeeze(J)
    return J
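# Illustrative check (not in the original post): with a perfect prediction
# AL == Y, the cross-entropy term is ~0 and only the L2 penalty remains:
# compute_cost(Y_train, Y_train, parameters, 0.05)  # ~ (0.05/(2*m)) * sum of W**2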
def linear_backward(dZ, linear_cache, lambd):
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T) + (lambd/m)*W
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db
def relu_gradient(Z):
    dZ = np.where(Z > 0, 1.0, 0.01)  # derivative of the leaky ReLU above
    return dZ

def sigmoid_gradient(Z):
    dZ = special.expit(Z)*(1-special.expit(Z))
    return dZ
def linear_activation_backward(dA, cache, lambd, AL, Y, activation):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = dA * relu_gradient(activation_cache)
    elif activation == 'sigmoid':
        dZ = AL - Y  # for a sigmoid output with cross-entropy loss, dZ simplifies to AL - Y
    dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    return dA_prev, dW, db
def L_model_backward(AL, Y, caches, lambd):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    cache_final_layer = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(None, cache_final_layer, lambd, AL, Y, activation='sigmoid')
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, AL, Y, activation='relu')
    return grads
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters
def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, k, lambd):
    mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size)  # [(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
    costs_train = []
    costs_dev = []
    parameters = initialize_parameters(layers_dims)
    AL_dev, caches_dev = forward_propagation(X_dev, parameters)
    J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)  # dev cost is reported without the L2 term
    costs_dev.append(J_dev)
    for i in range(num_epoch):
        for mini_batch in mini_batches:
            (minibatch_X, minibatch_Y) = mini_batch
            AL, caches = forward_propagation(minibatch_X, parameters)
            J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
            grads = L_model_backward(AL, minibatch_Y, caches, lambd)
            parameters = update_parameters(parameters, grads, learning_rate)
        if i % 10 == 0:
            costs_train.append(J_train)
            AL_dev, caches_dev = forward_propagation(X_dev, parameters)
            J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
            costs_dev.append(J_dev)
        if i % 100 == 0:
            print("Cost after epoch %i: %f" % (i, J_train))
        if i > 0 and i % 50 == 0:
            learning_rate = learning_rate * k  # decay alpha by k every 50 epochs
    plt.plot(np.squeeze(costs_train), 'r')
    plt.plot(np.squeeze(costs_dev), 'b')
    plt.ylabel('cost')
    plt.xlabel('epochs (per tens)')
    plt.show()
    return parameters, costs_train, costs_dev
parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400,50,10,1], 0.05, 1000, 64, 0.95, 0.05)
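Since a bug in the backward pass is a common cause of curves like these, a numerical gradient check, in the spirit of the gradient-checking exercise from the same specialization, can confirm that L_model_backward matches compute_cost. The sketch below is only illustrative (the gradient_check helper is not part of the training script above): it perturbs a few randomly chosen weights and compares each analytic gradient against a centered finite difference.
def gradient_check(parameters, X, Y, lambd, eps=1e-7, num_checks=3):
    # analytic gradients from the backward pass defined above
    AL, caches = forward_propagation(X, parameters)
    grads = L_model_backward(AL, Y, caches, lambd)
    for l in range(len(parameters)//2):
        name = 'W' + str(l+1)
        W = parameters[name]
        dW = grads['d' + name]
        for _ in range(num_checks):
            # perturb one randomly chosen entry of W in both directions
            idx = np.unravel_index(np.random.randint(W.size), W.shape)
            old_value = W[idx]
            W[idx] = old_value + eps
            J_plus = compute_cost(forward_propagation(X, parameters)[0], Y, parameters, lambd)
            W[idx] = old_value - eps
            J_minus = compute_cost(forward_propagation(X, parameters)[0], Y, parameters, lambd)
            W[idx] = old_value  # restore the weight
            numeric = (J_plus - J_minus) / (2*eps)
            print(name, idx, 'analytic:', dW[idx], 'numeric:', numeric)
# e.g. gradient_check(initialize_parameters([6400,50,10,1]), X_train[:, :64], Y_train[:, :64], 0.05)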
I would really appreciate anyone with the patience to read through my code. If the problem is still overfitting, could you give some advice on how to address it? I am at a loss here, because the validation loss rises at a very early stage, so early stopping would prevent the model from learning any further and lead to underfitting. Any advice would be appreciated.
Solution
When the validation loss starts increasing early on, as in the image you added, it means that there is a problem with the model. Since you have not shown your model, it is not clear exactly what it is.
You can check the following links, which may help:
- Basic Cat vs. Dog Detailed Example in Colab
- A detailed explanation of Over-fitting in the TF Tutorial
Or add your full code; a rough sketch of the pattern those tutorials use is shown below.
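The sketch assumes tf.keras and only illustrates that pattern (a small convolutional model plus dropout, trained with early stopping); the layer sizes, dropout rate and patience value are placeholders, not values taken from the tutorials:
import tensorflow as tf

# A small CNN with dropout, trained with early stopping that restores the
# weights from the epoch with the best validation loss.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(80, 80, 1)),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True)

# Expects images shaped (num_examples, 80, 80, 1), i.e. not flattened:
# model.fit(train_images, train_labels, validation_data=(dev_images, dev_labels),
#           epochs=100, batch_size=64, callbacks=[early_stop])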