
What is the difference between CharEmbeddings and CNNCharEmbeddings / Encoding?


While studying an ML model I found two seemingly different modules for CharEmbedding / CharEncoding, but it is not clear to me why both are needed or what the difference between them is.

The CharEmbedding is as follows, and it is done through an LSTM, as I have always believed it should be:

import torch
import torch.nn as nn

class CharEmbeddings(nn.Module):
    def __init__(self, char_vocab, embedding_dim, hidden_size, which_cuda=0):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.vocab = char_vocab
        self.hidden_size = hidden_size
        # device selection (not shown in the original snippet)
        self.device = torch.device(f'cuda:{which_cuda}' if torch.cuda.is_available() else 'cpu')

        self.embeddings = nn.Embedding(
            num_embeddings=len(self.vocab), embedding_dim=embedding_dim, padding_idx=self.vocab.pad
        )

        self.bilstm = nn.LSTM(
            input_size=embedding_dim, hidden_size=self.hidden_size, num_layers=1, bidirectional=True
        )

    def forward(self, sentence_batch):
        # char2index + padding
        words, lengths, unsort_idx = self.prepare(sentence_batch)
        # words -> (n_pad_words, max_word_length)
        # sort + remove 0-length words (pads)
        non_zero_words, non_zero_lengths = self.remove_pad_words(words, lengths)
        # non_zero_words -> (n_nonpad_words, max_word_length)

        embeddings = self.embeddings(non_zero_words).to(self.device)
        # embeddings -> (n_nonpad_words, max_word_length, embedding_dim)
        # pack
        x = torch.nn.utils.rnn.pack_padded_sequence(embeddings, non_zero_lengths, batch_first=True)
        # pass through the lstm
        output, hidden = self.bilstm(x)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        # x -> (n_nonpad_words, max_word_length, hidden_size*2)

        # take the output of the lstm correctly
        # filter_idx is the index of the last character in each word
        # this aims to find the output of the lstm on the last non-pad char
        # ex: ['h','e','l','o','<pad>']
        # instead of getting the output of '<pad>' we want the output of 'o'
        filter_idx = non_zero_lengths.long().view(-1, 1, 1).expand(-1, 1, self.hidden_size * 2) - 1
        # filter_idx -> (n_nonpad_words, 1, hidden_size*2)
        forward_out = x.gather(1, filter_idx).squeeze(1)[:, :self.hidden_size]
        # forward_out -> (n_nonpad_words, hidden_size)

        # get the backward output of the first character
        backward_out = x[:, 0, self.hidden_size:]
        # concat the last char's forward hidden state with the first char's backward hidden state
        x = torch.cat([forward_out, backward_out], 1)
        # x -> (n_nonpad_words, hidden_size*2)

        # re-add zero vectors for the pad words that were filtered out
        x = torch.cat([x, torch.zeros(len(words) - len(non_zero_words), self.hidden_size * 2).to(self.device)], 0)
        # x -> (n_pad_words, hidden_size*2)
        # unsort
        x = x[unsort_idx]
        # reshape to sentence size
        x = x.view(len(sentence_batch), -1, self.hidden_size * 2)
        # x -> (batch_size, max_sentence_size, hidden_size*2)
        return x

    def prepare(self, sentence_batch):
        # receives a batch of sentences and returns a batch of words and their lengths sorted in decreasing order
        # (body omitted)
        return padded_words, sort_word_len, unsort_idx

    def remove_pad_words(self, padded_words, word_lengths):
        # removes pad words from a batch of words
        # note that the index returned is the size of the batch before filtering pads
        # so, before unsorting, padded values must be re-added to the tensor
        # (body omitted)
        return padded_words, sort_word_len
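
To check that I am reading the gather trick correctly, here is a minimal self-contained sketch (toy tensors, not the real classes or vocab) of how the last non-pad timestep is pulled out of the padded BiLSTM output and concatenated with the first timestep's backward half:

import torch

hidden_size = 3
# padded BiLSTM output for 2 words: (n_words, max_word_length, hidden_size*2)
out = torch.randn(2, 5, hidden_size * 2)
lengths = torch.tensor([5, 3])  # the second word ends in 2 pad characters

# index of the last real character per word, broadcast over the feature dimension
idx = (lengths - 1).view(-1, 1, 1).expand(-1, 1, hidden_size * 2)
last_step = out.gather(1, idx).squeeze(1)    # (n_words, hidden_size*2)
forward_out = last_step[:, :hidden_size]     # forward direction at the last real char
backward_out = out[:, 0, hidden_size:]       # backward direction at the first char
char_repr = torch.cat([forward_out, backward_out], dim=1)
print(char_repr.shape)                       # torch.Size([2, 6])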

Ignoring the details of the code, this looks to me like a plain CharEmbedding. Conversely, I do not understand why you would have to use a CNN for the CharEncoding, as below:

class CNNCharEmbeddings(CharEmbeddings):
    def __init__(self, char_vocab, cnn_embeddings_size, cnn_ce_kernel_size, cnn_ce_out_channels, which_cuda=0):
        torch.backends.cudnn.deterministic = True
        # reuse the embedding table and the prepare/remove_pad_words helpers from CharEmbeddings
        CharEmbeddings.__init__(self, char_vocab, cnn_embeddings_size, cnn_ce_out_channels, which_cuda=which_cuda)

        self.embedding_dim = cnn_embeddings_size
        self.vocab = char_vocab

        self.cnn_ce_kernel_size = cnn_ce_kernel_size
        self.cnn_ce_out_channels = cnn_ce_out_channels

        self.cnn = nn.Conv1d(
            in_channels=self.embedding_dim,
            out_channels=self.cnn_ce_out_channels,
            kernel_size=cnn_ce_kernel_size,
            stride=1,
            padding=int((cnn_ce_kernel_size - 1) / 2),
        )

    def forward(self, sentence_batch):
        # char2index + padding, then drop zero-length (pad) words, as in CharEmbeddings
        words, lengths, unsort_idx = self.prepare(sentence_batch)
        non_zero_words, non_zero_lengths = self.remove_pad_words(words, lengths)
        embeddings = self.embeddings(non_zero_words).to(self.device)
        # embeddings -> (n_nonpad_words, max_word_length, embedding_dim)

        # Conv1d expects (batch, channels, length), so the character axis goes last
        x = self.cnn(embeddings.transpose(1, 2))
        # x -> (n_nonpad_words, cnn_ce_out_channels, max_word_length)
        # mask the positions that correspond to pad characters so they cannot win the max-pool
        mask = (torch.arange(x.shape[2]).expand(x.shape).to(self.device) < non_zero_lengths.unsqueeze(1).unsqueeze(1).to(self.device)).float()
        x_min = (torch.arange(x.shape[2]).expand(x.shape).to(self.device) >= non_zero_lengths.unsqueeze(1).unsqueeze(1).to(self.device)).float() * torch.min(x)
        x_min = x_min.detach().float()
        x = x * mask + x_min
        x = F.relu(x)
        # max-pool over the whole character axis: one vector per word
        x = nn.MaxPool1d(kernel_size=x.shape[2])(x)

        x = x.view([x.shape[0], x.shape[1]])
        # re-add zero vectors for the pad words that were filtered out
        x = torch.cat([x, torch.zeros(len(words) - len(non_zero_words), self.cnn_ce_out_channels).to(self.device)], 0)

        # unsort and reshape to sentence size
        x = x[unsort_idx]
        x = x.view(len(sentence_batch), -1, self.cnn_ce_out_channels)
        # x -> (batch_size, max_sentence_size, cnn_ce_out_channels)
        return x
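
As I understand it, the CNN path boils down to: embed the characters, run a Conv1d over the character axis, mask the pad positions, and max-pool over the whole word. A minimal toy sketch of that idea (hypothetical sizes; the masking here is a simplified variant of the min-value trick above):

import torch
import torch.nn as nn
import torch.nn.functional as F

embedding_dim, out_channels, kernel_size = 4, 6, 3
conv = nn.Conv1d(embedding_dim, out_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2)

# 2 words padded to 5 characters: (n_words, max_word_length, embedding_dim)
emb = torch.randn(2, 5, embedding_dim)
lengths = torch.tensor([5, 3])

x = F.relu(conv(emb.transpose(1, 2)))    # (n_words, out_channels, max_word_length)
# positions at or beyond the word length are pads; make sure they can never win the max
pad_mask = torch.arange(x.shape[2]).unsqueeze(0) >= lengths.unsqueeze(1)   # (n_words, max_word_length)
x = x.masked_fill(pad_mask.unsqueeze(1), float('-inf'))
word_vec = x.max(dim=2).values           # (n_words, out_channels): one vector per word
print(word_vec.shape)                    # torch.Size([2, 6])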

In general, using both modules improves the model more than using only one of them. But if I already have the CharEncoding, I cannot see what the CNN-based CharEmbedding adds. Does it have something to do with encoder-decoder?
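
For concreteness, by "using both" I mean something like concatenating the two character vectors with the word embedding per token before the sentence-level encoder (all shapes below are made up for illustration):

import torch

batch_size, max_sentence_len = 2, 7
word_dim, lstm_char_dim, cnn_char_dim = 100, 50, 30   # hypothetical sizes

word_emb = torch.randn(batch_size, max_sentence_len, word_dim)          # word embeddings
lstm_char = torch.randn(batch_size, max_sentence_len, lstm_char_dim)    # CharEmbeddings output
cnn_char = torch.randn(batch_size, max_sentence_len, cnn_char_dim)      # CNNCharEmbeddings output

# the per-token representation fed to the sentence encoder
token_repr = torch.cat([word_emb, lstm_char, cnn_char], dim=-1)
print(token_repr.shape)                                                 # torch.Size([2, 7, 180])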

Thanks!
