RuntimeError: Cost function returned nan values in its first output

How to fix "RuntimeError: Cost function returned nan values in its first output"

I am working on a temporal model to predict future events. Here is the link to my Colab notebook. I run into a problem when training the model: both the training and the validation loss come out as NaN. The loss is a joint loss made up of a cross-entropy loss and a squared loss. Link to the blog here.

Solutions I have tried that did not work: lower learning rates (0.01, 0.001, 0.0001).

class cost_function():
    def __init__(self,yhat,y,L_2=0.001,logEps=1e-8):
        # logEps : log epsilon, a very small positive value greater than 0.0
        # CE = -[ y*ln(yhat) + (1-y)*ln(1-yhat) ]
        self.yhat = yhat
        self.y = y
       
        self.logEps = logEps
        self.L_2 = L_2
        
        self.W_out = nn.Parameter(torch.randn(hiddenDimSize,numClass)*0.01)  # note: hiddenDimSize and numClass are globals from the notebook
        
    def cross_entropy(self):
        ce = -(self.y * torch.log(self.yhat + self.logEps) + (1. - self.y) * torch.log(1. - self.yhat + self.logEps))
        print("Inside CrossEntrophy Loss fn : ",ce)
        return ce

    def prediction_loss(self):
        # return  (torch.sum(torch.sum(self.cross_entropy(),dim=0),dim=1)).float()/  lengths.float()
        
        tmp_tensor = torch.sum(self.cross_entropy(),dim=0)
        print("Inside PredictionLoss fn : Sum Dim 0",tmp_tensor)
        print("Inside PredictionLoss fn : Sum Dim 1",torch.sum(tmp_tensor,dim=1))
        print("Inside PredictionLoss fn : Final Result ",(torch.sum(tmp_tensor,dim=1)).float()/  lengths.float())
        return (torch.sum(tmp_tensor,dim=1)).float()/  lengths.float()
        
    def cost(self):
        print("Inside Cost fn :",torch.mean(self.prediction_loss()) + self.L_2 * (self.W_out ** 2).sum())
        return torch.mean(self.prediction_loss()) + self.L_2 * (self.W_out ** 2).sum() # regularize
    

build_EHRNN class - I modified the forward method's parameters to fix the undefined 'h' error.

torch.manual_seed(1)

class build_EHRNN(nn.Module):
    def __init__(self,inputDimSize=4894,hiddenDimSize=[200,200],batchSize=100,embSize=200,numClass=4894,dropout=0.5,logEps=1e-8):
        super(build_EHRNN,self).__init__()
        
        self.inputDimSize = inputDimSize
        self.hiddenDimSize = hiddenDimSize
        self.numClass = numClass
        self.embSize = embSize
        self.batchSize = batchSize
        self.dropout = nn.Dropout(p=0.5)
        self.logEps = logEps
        
        
        # Embedding inputs
        self.W_emb = nn.Parameter(torch.randn(self.inputDimSize,self.embSize).cuda())
        self.b_emb = nn.Parameter(torch.zeros(self.embSize).cuda())
        
        self.W_out = nn.Parameter(torch.randn(self.hiddenDimSize,self.numClass).cuda())
        self.b_out = nn.Parameter(torch.zeros(self.numClass).cuda())
         
        self.params = [self.W_emb,self.W_out,self.b_emb,self.b_out] 
    
    # def forward(self,x,h,lengths,mask):
    def forward(self,x,mask):  # x is passed in explicitly; the model is later called as model(x,mask)
        self.emb = torch.tanh(torch.matmul(x,self.W_emb) + self.b_emb)
        input_values = self.emb
        self.outputs = [input_values]
        for i,hiddenSize in enumerate([self.hiddenDimSize,self.hiddenDimSize]):  # iterate over layers
            rnn = EHRNN(self.inputDimSize,hiddenSize,self.embSize,self.batchSize,self.numClass) # calculate hidden states
            hidden_state = []
            h = self.init_hidden().cuda()
            for i,seq in enumerate(input_values): # loop over sequences in each batch
                h = rnn(seq,h)                    
                hidden_state.append(h)    
            hidden_state = self.dropout(torch.stack(hidden_state))    # apply dropout between layers
            input_values = hidden_state
       
        y_linear = torch.matmul(hidden_state,self.W_out)  + self.b_out # fully connected layer
        yhat = F.softmax(y_linear,dim=1)  # yhat
        yhat = yhat*mask[:,:,None]   # apply mask
        
        # Loss calculation
        cross_entropy = -(y * torch.log(yhat + self.logEps) + (1. - y) * torch.log(1. - yhat + self.logEps))
        last_step = -torch.mean(y[-1] * torch.log(yhat[-1] + self.logEps) + (1. - y[-1]) * torch.log(1. - yhat[-1] + self.logEps))
        prediction_loss = torch.sum(torch.sum(cross_entropy,dim=0),dim=1) / torch.cuda.FloatTensor(lengths)  # summed CE per sequence divided by sequence length
        cost = torch.mean(prediction_loss) + 0.000001 * (self.W_out ** 2).sum() # regularize
        return (yhat,hidden_state,cost)

    def init_hidden(self):
        return torch.zeros(self.batchSize,self.hiddenDimSize)  # initial state

Model training

artificalData_seqs = np.array(pickle.load(open(os.path.join(GOOGLE_DRV_PATH,BASE_DIR,'data.encodedDxs'),'rb')))
train,test,valid = load_data(artificalData_seqs,artificalData_seqs)

batchSize = 50     # decreased from 100 to 50
n_batches = int(np.ceil(float(len(train[0])) / float(batchSize)))-1
n_batches_valid = int(np.ceil(float(len(valid[0])) / float(batchSize)))-1
model = build_EHRNN(inputDimSize=4894,hiddenDimSize=200,batchSize=50,logEps=1e-8)
model = model.to(device)



import torch.nn.functional as F
import pdb

optimizer = torch.optim.Adadelta(model.parameters(),lr = 0.001,rho=0.95)
epochs = 10

counter = 0
# with torch.autograd.detect_anomaly():
for e in range(epochs):
    for x,y in train_dl:
        x,mask,lengths = padding(x,inputDimSize,numClass)
        output,h,_ = model(x,mask)   # forward returns (yhat, hidden_state, cost)
        
        loss = cost_function(output,y).cost()
        # pdb.set_trace()
        loss.backward()
        print("loss ",loss)
        nn.utils.clip_grad_norm_(model.parameters(),5) # clip the gradient norm to mitigate exploding gradients
        optimizer.step()
        optimizer.zero_grad()
    
    with torch.no_grad():
        model.eval()
        val_loss = []
        for x_valid,y_valid in valid_dl:
            x_val,y_val,lengths = padding(x_valid,y_valid,numClass)
            outputs_val,hidden_val,_ = model(x_val,mask)
            loss = cost_function(outputs_val,y_val).cost()
            val_loss.append(loss.item())
        model.train()

        print("Epoch: {}/{}...".format(e+1,epochs),"Step: {}...".format(counter),"Training Loss: {:.4f}...".format(loss.item()),"Val Loss: {:.4f}".format(torch.mean(torch.tensor(val_loss))))

Error (the loss starts turning into NaN)

Inside PredictionLoss fn : Sum Dim 0 tensor([[0.1008,0.1539,0.1211,...,0.1533,0.1218,0.1418],[0.0253,0.0449,0.0249,0.0439,0.0134,0.0332],[0.0306,0.0799,0.0570,0.0790,0.0484,0.0678],0.0450,[0.0038,0.0106,0.0098,0.0004,0.0106]],grad_fn=<SumBackward1>)
Inside PredictionLoss fn : Sum Dim 1 tensor([  372.4754,133.2620,219.1195,37.5425,141.3354,37.5070,229.2947,0.0000,379.1829,217.3962,80.1226,37.5074,138.4665,82.1034,89.7893,81.8173,92.8159,141.8856,95.9898,216.0511,133.2535,385.0391,369.4958,244.9362,37.5088,37.5087,141.6083,95.3367,735.0569,378.0407,37.5135,40.7778,82.0872,225.9998,216.6189,379.0732,81.4742,144.4226,93.3905,214.0228,37.5078,224.0793,88.3753,41.2919,140.4855,37.5086,226.6366,148.7171,137.9226,13887.5811,81.1428,84.6804,226.6779,37.5065,223.8841,220.5979,83.2484,37.5080,84.5247,384.2115,80.1173,146.9714,37.6982,134.6618,84.1838,37.5421,730.5516,37.5085,215.1523,136.5673,81.2887,94.4181,140.6268,133.9295,136.2485,386.2103,39.0282,37.5055,42.1506,80.1662,228.5819,39.3403,138.7672,1768.6033,143.5350,40.2060,147.7809,380.9214,750.6883,141.0447,136.9028,37.5049],grad_fn=<SumBackward1>)
Inside PredictionLoss fn : lengths tensor([5.,3.,4.,1.,0.,5.,2.,6.,9.,7.,1.])
Inside PredictionLoss fn : Final Result  tensor([  74.4951,44.4207,54.7799,47.1118,57.3237,nan,75.8366,54.3491,40.0613,46.1555,41.0517,44.8946,40.9086,46.4080,47.2952,47.9949,54.0128,44.4178,77.0078,73.8992,61.2340,47.2028,47.6683,122.5095,75.6081,41.0436,56.5000,54.1547,75.8146,40.7371,48.1409,46.6952,53.5057,56.0198,44.1876,46.8285,56.6591,49.5724,45.9742,1543.0646,40.5714,42.3402,56.6695,55.9710,55.1495,41.6242,42.2623,76.8423,40.0586,48.9905,44.8873,42.0919,121.7586,53.7881,45.5224,40.6443,47.2090,46.8756,44.6432,45.4162,40.0587,77.2421,40.0831,57.1455,46.2557,252.6576,47.8450,49.2603,76.1843,125.1147,47.0149,45.6343,grad_fn=<DivBackward0>)
Inside PredictionLoss fn : Sum Dim 0 tensor([[nan,nan],[nan,nan]],grad_fn=<SumBackward1>)
Inside PredictionLoss fn : Sum Dim 1 tensor([nan,grad_fn=<SumBackward1>)
Inside PredictionLoss fn : lengths tensor([2.,3.])
Inside PredictionLoss fn : Final Result  tensor([nan,grad_fn=<SumBackward1>)
Inside PredictionLoss fn : lengths tensor([3.,2.])
Inside PredictionLoss fn : Final Result  tensor([nan,grad_fn=<SumBackward1>)
Inside PredictionLoss fn : lengths tensor([4.,4.])
Inside PredictionLoss fn : Final Result  tensor([nan,grad_fn=<SumBackward1>)
Inside PredictionLoss fn : lengths tensor([1.,6.])
Inside PredictionLoss fn : Final Result  tensor([nan,8.])
Inside PredictionLoss fn : Final Result  tensor([nan,8.,grad_fn=<SumBackward1>)

Solution

The problem is the values in the lengths variable.

In your cost_function.prediction_loss, the cross-entropy loss is divided by the length of each sequence: (torch.sum(tmp_tensor,dim=1)).float() / lengths.float()
However, if you look at the values of the lengths tensor:

Inside PredictionLoss fn : lengths tensor([5.,3.,4.,1.,0.,5.,2.,6.,9.,7.,1.])

you will notice that some of the entries are 0 (!). The corresponding values in the loss are also zero (a zero-length sequence contributes no loss). When you divide zero by zero, you get nan.
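
One way to avoid this is to exclude (or clamp) the zero-length sequences before dividing. Below is a minimal sketch of such a guard, assuming per_seq_loss is the per-sequence summed cross entropy (the tensor printed as "Sum Dim 1" above) and lengths may contain zeros as in the log; the function name is illustrative, not part of the original notebook.

import torch

def safe_prediction_loss(per_seq_loss, lengths):
    # per_seq_loss : (batch,) summed cross entropy per sequence
    # lengths      : (batch,) sequence lengths, possibly containing zeros
    lengths = lengths.float()
    valid = lengths > 0                        # mask out zero-length sequences
    if not valid.any():
        return per_seq_loss.new_zeros(())      # nothing to average over
    return (per_seq_loss[valid] / lengths[valid]).mean()

Alternatively, filtering empty sequences out in padding(), or clamping with lengths.clamp(min=1), removes the division by zero at the source.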


Some good coding practices:

  1. Where possible, use library functions instead of re-implementing them. These functions are usually tested, optimized, and more numerically stable.
    For example, you can use torch.nn.CrossEntropyLoss, which combines the softmax and the cross-entropy loss in a numerically robust way (see the sketch after this list).

  2. The variable lengths used in the loss computation is apparently neither an argument of the loss function nor a class member. You should make it an explicit argument.
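
As a rough illustration of both points, here is a minimal sketch of a loss module that receives lengths as an explicit argument and delegates the softmax/log computation to torch.nn.CrossEntropyLoss. It assumes the targets are per-time-step class indices and that the logits are the pre-softmax scores (y_linear in the model); if the notebook's y is actually multi-hot, a multi-label loss such as nn.BCEWithLogitsLoss would play the same role. The class and argument names are illustrative only.

import torch
import torch.nn as nn

class JointCost(nn.Module):
    def __init__(self, W_out, L_2=0.001):
        super().__init__()
        self.ce = nn.CrossEntropyLoss(reduction='none')   # numerically stable softmax + CE
        self.W_out = W_out
        self.L_2 = L_2

    def forward(self, logits, targets, lengths):
        # logits  : (seq_len, batch, numClass), targets : (seq_len, batch), lengths : (batch,)
        ce = self.ce(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        ce = ce.reshape(logits.shape[0], logits.shape[1]).sum(dim=0)  # summed CE per sequence
        lengths = lengths.float().clamp(min=1.0)                      # explicit lengths, no zero division
        return (ce / lengths).mean() + self.L_2 * (self.W_out ** 2).sum()

In the training loop this would be used as, e.g., criterion = JointCost(model.W_out) and loss = criterion(y_linear, y_idx, lengths), so that lengths is never picked up implicitly from the surrounding scope.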
