CharEmbeddings 和 CNNCharEmbeddings(CharEncoding)有什么区别?
在研究一个机器学习模型时,我发现其中有两个看似不同的模块,分别用来执行 CharEmbedding 和 CharEncoding,但我不清楚为什么两者都需要,也不清楚它们之间有什么区别。
CharEmbedding 的实现如下,它通过 LSTM 完成,这和我一直以来的理解一致:
class CharEmbeddings(nn.Module):
    """Character-level word encoder built on a bidirectional LSTM.

    Every word of every sentence is encoded from its characters: characters
    are embedded, run through a single-layer BiLSTM, and the word vector is
    the concatenation of the forward direction's output at the word's last
    real character and the backward direction's output at its first
    character.

    Args:
        char_vocab: character vocabulary; must support ``len()`` and expose
            the padding index as ``.pad``.
        embedding_dim: size of each character embedding.
        hidden_size: hidden size of each LSTM direction (word vectors have
            ``2 * hidden_size`` features).
        which_cuda: index of the CUDA device to target when CUDA is
            available; ignored otherwise.
    """

    def __init__(self, char_vocab, embedding_dim, hidden_size, which_cuda=0):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.vocab = char_vocab
        self.hidden_size = hidden_size
        # forward() moves intermediate tensors to self.device, so it has to
        # exist. NOTE(review): the module's own parameters are not moved
        # here; presumably the caller does `.to(module.device)` - confirm.
        self.device = torch.device(
            "cuda:{}".format(which_cuda) if torch.cuda.is_available() else "cpu"
        )
        self.embeddings = nn.Embedding(
            num_embeddings=len(self.vocab),
            embedding_dim=embedding_dim,
            padding_idx=self.vocab.pad,
        )
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=1,
            bidirectional=True,
        )

    def forward(self, sentence_batch):
        """Encode every word of ``sentence_batch`` from its characters.

        Returns:
            Tensor of shape (batch_size, max_sentence_size, hidden_size * 2);
            rows that correspond to padding words are all zeros.
        """
        out_dim = self.hidden_size * 2

        # char2index + padding
        words, lengths, unsort_idx = self.prepare(sentence_batch)
        # words -> (n_pad_words, max_word_length), sorted by decreasing length

        # drop zero-length (padding) words; they sit at the end after sorting
        non_zero_words, non_zero_lengths = self.remove_pad_words(words, lengths)
        # non_zero_words -> (n_nonpad_words, max_word_length)

        if len(non_zero_words) == 0:
            # Nothing to encode: packing an empty batch would raise.
            x = torch.zeros(len(words), out_dim, device=self.device)
        else:
            embeddings = self.embeddings(non_zero_words).to(self.device)
            # embeddings -> (n_nonpad_words, max_word_length, embedding_dim)

            # pack_padded_sequence requires the lengths on the CPU
            packed = torch.nn.utils.rnn.pack_padded_sequence(
                embeddings, non_zero_lengths.cpu(), batch_first=True
            )
            output, _ = self.bilstm(packed)
            x, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
            # x -> (n_nonpad_words, max_word_length, hidden_size * 2)

            # Index of the last real character of each word, so that for
            # ['h', 'e', 'l', 'o', '<pad>'] we read the output at 'o' rather
            # than at '<pad>'.
            last_idx = (non_zero_lengths.long().to(x.device) - 1)
            last_idx = last_idx.view(-1, 1, 1).expand(-1, 1, out_dim)
            # last_idx -> (n_nonpad_words, 1, hidden_size * 2)

            # forward direction: first half of the features at the last char
            forward_out = x.gather(1, last_idx).squeeze(1)[:, :self.hidden_size]
            # backward direction: second half of the features at the first char
            backward_out = x[:, 0, self.hidden_size:]
            x = torch.cat([forward_out, backward_out], 1)
            # x -> (n_nonpad_words, hidden_size * 2)

            # re-add one zero row per padding word so unsort_idx lines up
            n_pad = len(words) - len(non_zero_words)
            x = torch.cat([x, x.new_zeros(n_pad, out_dim)], 0)
        # x -> (n_pad_words, hidden_size * 2)

        # unsort
        x = x[unsort_idx]
        # reshape to sentence size
        x = x.view(len(sentence_batch), -1, out_dim)
        # x -> (batch_size, max_sentence_size, hidden_size * 2)
        return x

    def prepare(self, sentence_batch):
        """Turn a batch of sentences into a length-sorted batch of words.

        Expected to return ``(padded_words, sort_word_len, unsort_idx)``:
        the character-index matrix of all words, their lengths sorted in
        decreasing order, and the index that restores the original order.

        The implementation was elided where this class was published and the
        vocabulary's lookup API is not visible, so it cannot be reconstructed
        here; subclasses (or the full original source) must supply it.
        """
        raise NotImplementedError(
            "CharEmbeddings.prepare was elided from the published source"
        )

    def remove_pad_words(self, padded_words, word_lengths):
        """Drop padding (zero-length) words from a length-sorted word batch.

        Assumes ``word_lengths`` is a tensor sorted in decreasing order, as
        ``prepare`` is described to return, so padding words form the tail.
        The unsort index still refers to the unfiltered batch, so the caller
        must re-add the dropped rows before unsorting.

        Returns:
            ``(non_pad_words, non_pad_lengths)``.
        """
        n_real = int((word_lengths > 0).sum())
        return padded_words[:n_real], word_lengths[:n_real]
抛开代码细节不谈,在我看来这就是普通的 CharEmbedding。
另一方面,我不明白为什么还必须使用 CNN 来进行 CharEncoding,如下所示:
class CNNCharEmbeddings(CharEmbeddings):
    """Character-level word encoder built on a 1-D convolution.

    Characters are embedded, convolved along the character axis, passed
    through ReLU and max-pooled over the whole word, giving one
    ``cnn_ce_out_channels``-sized vector per word. Word batching
    (``prepare`` / ``remove_pad_words``) is inherited from
    ``CharEmbeddings``.

    Args:
        cnn_embeddings_size: size of each character embedding (the
            convolution's input channels).
        cnn_ce_kernel_size: convolution kernel width.
        cnn_ce_out_channels: number of convolution filters, i.e. the size of
            each word vector.
        which_cuda: index of the CUDA device to target when CUDA is
            available; ignored otherwise.
        char_vocab: character vocabulary; must support ``len()`` and expose
            the padding index as ``.pad``. Required.

    Raises:
        ValueError: if ``char_vocab`` is not supplied.
    """

    def __init__(self, cnn_embeddings_size, cnn_ce_kernel_size, cnn_ce_out_channels,
                 which_cuda=0, char_vocab=None):
        if char_vocab is None:
            raise ValueError("CNNCharEmbeddings requires a char_vocab")
        # Global switch: makes cuDNN pick deterministic convolution kernels.
        torch.backends.cudnn.deterministic = True
        # Initialise as a bare nn.Module rather than through
        # CharEmbeddings.__init__: the parent constructor also needs a
        # hidden_size and builds a BiLSTM, neither of which this encoder has
        # or uses.
        nn.Module.__init__(self)
        self.embedding_dim = cnn_embeddings_size
        self.vocab = char_vocab
        self.cnn_ce_kernel_size = cnn_ce_kernel_size
        self.cnn_ce_out_channels = cnn_ce_out_channels
        # forward() moves intermediate tensors to self.device.
        self.device = torch.device(
            "cuda:{}".format(which_cuda) if torch.cuda.is_available() else "cpu"
        )
        self.embeddings = nn.Embedding(
            num_embeddings=len(self.vocab),
            embedding_dim=self.embedding_dim,
            padding_idx=self.vocab.pad,
        )
        self.cnn = nn.Conv1d(
            in_channels=self.embedding_dim,
            out_channels=self.cnn_ce_out_channels,
            kernel_size=cnn_ce_kernel_size,
            stride=1,
            padding=(cnn_ce_kernel_size - 1) // 2,
        )

    def forward(self, sentence_batch):
        """Encode every word of ``sentence_batch`` from its characters.

        Returns:
            Tensor of shape (batch_size, max_sentence_size,
            cnn_ce_out_channels); rows that correspond to padding words are
            all zeros.
        """
        words, lengths, unsort_idx = self.prepare(sentence_batch)
        non_zero_words, non_zero_lengths = self.remove_pad_words(words, lengths)

        if len(non_zero_words) == 0:
            # Nothing to encode: every word in the batch is padding.
            x = torch.zeros(len(words), self.cnn_ce_out_channels, device=self.device)
        else:
            embeddings = self.embeddings(non_zero_words).to(self.device)
            # Conv1d convolves over the last axis, so put characters last:
            # (n_nonpad_words, embedding_dim, max_word_length)
            x = self.cnn(embeddings.transpose(1, 2))

            # 1.0 at real character positions, 0.0 at padding positions
            positions = torch.arange(x.shape[2]).expand(x.shape).to(self.device)
            word_len = non_zero_lengths.unsqueeze(1).unsqueeze(1).to(self.device)
            real = (positions < word_len).float()

            # Overwrite padding positions with the global minimum activation
            # (kept out of the autograd graph) so they cannot win the max-pool.
            fill = ((1.0 - real) * torch.min(x)).detach()
            x = x * real + fill

            x = F.relu(x)
            # max over the whole character axis -> one value per filter
            x = nn.MaxPool1d(kernel_size=x.shape[2])(x)
            x = x.view([x.shape[0], x.shape[1]])
            # x -> (n_nonpad_words, cnn_ce_out_channels)

            # re-add one zero row per padding word so unsort_idx lines up
            n_pad = len(words) - len(non_zero_words)
            x = torch.cat([x, x.new_zeros(n_pad, self.cnn_ce_out_channels)], 0)

        x = x[unsort_idx]
        x = x.view(len(sentence_batch), -1, self.cnn_ce_out_channels)
        return x
通常,同时使用这两个模块给模型带来的提升,比只使用其中一个更大。但是,如果我已经有了 CharEncoding,我就不太理解 CNN 和 CharEmbedding 的用处了。
这与 encoder-decoder 有关系吗?
谢谢!
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。