如何解决为什么我的一个模型训练需要将近2个小时?
我正在研究神经机器翻译(从Fr到En)。我开发了第一个模型(使用Quadro P5000 GPU 16278MiB),该模型是使用LSTM进行简单的序列到序列的设计。
# Model definition
class EncoderLSTM(nn.Module):
    """Bidirectional LSTM encoder over embedded source-token sequences."""

    def __init__(self, embedding_size, vocab_size, hidden_size, n_layers,
                 dropout, recurrent_dropout):
        super(EncoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # BUG FIX: the original call omitted `hidden_size`, so nn.LSTM would
        # raise a TypeError (its second positional argument is required).
        # Inter-layer dropout is only legal when n_layers > 1.
        self.lstm = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=n_layers,
            bidirectional=True,
            dropout=(recurrent_dropout if n_layers > 1 else 0),
        )

    def load_pretrained_embeddings(self, embeddings):
        """Replace the embedding matrix with pre-trained vectors."""
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        """Enable/disable gradient updates on the embedding table."""
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_sequences, sequence_lengths):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,] (sorted descending — required
                by pack_padded_sequence with its default enforce_sorted=True)
        :return
            outputs: Tensor[seq_len, batch_size, 2 * hidden_size]
            hn: Tensor[n_layers * 2, batch_size, hidden_size]
            cn: Tensor[n_layers * 2, batch_size, hidden_size]
        """
        embedded = self.embedding(input_sequences)
        # NOTE(review): F.dropout defaults to training=True, so this dropout
        # is applied even in eval mode — probably unintended; confirm.
        embedded = F.dropout(embedded, p=self.dropout)
        # Pack so the LSTM skips <pad> positions.
        packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)
        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        return outputs, hn, cn
class DecoderLSTM(nn.Module):
    """Unidirectional LSTM decoder predicting one target token per step.

    Reconstructed: the scraped original had a truncated __init__ (the
    `super(...)` call was fused with the nn.LSTM kwargs) and two methods
    merged into one; restored by symmetry with EncoderLSTM.
    """

    def __init__(self, embedding_size, vocab_size, hidden_size, n_layers,
                 dropout, recurrent_dropout):
        super(DecoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=n_layers,
            dropout=(recurrent_dropout if n_layers > 1 else 0),
        )
        # Projects the top-layer hidden state to vocabulary logits.
        self.fc = nn.Linear(hidden_size, vocab_size)

    def load_pretrained_embeddings(self, embeddings):
        """Replace the embedding matrix with pre-trained vectors."""
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        """Enable/disable gradient updates on the embedding table."""
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_word_index, h_state, c_state):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[n_layers, batch_size, hidden_size]
            c_state: Tensor[n_layers, batch_size, hidden_size]
        :return
            logit: Tensor[batch_size, vocab_size]
            h_state: Tensor[n_layers, batch_size, hidden_size]
            c_state: Tensor[n_layers, batch_size, hidden_size]
        """
        # unsqueeze(0): single decoding step => seq_len = 1
        embedded = self.embedding(input_word_index.unsqueeze(0))
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        # NOTE(review): F.dropout is active in eval mode too (training=True
        # by default) — confirm this is intended.
        logit = self.fc(F.dropout(outputs, p=self.dropout))
        logit = logit.squeeze(0)
        return logit, h_state, c_state
class SeqToSeqLSTM(nn.Module):
    """Encoder-decoder wrapper with teacher forcing (no attention).

    Reconstructed: several lines of the scraped original were truncated
    (encoder unpack, state permutes, logits allocation); restored from the
    shape comments the original carried.
    """

    def __init__(self, encoder, decoder, device):
        assert encoder.n_layers == decoder.n_layers,\
            'Encoder and Decoder must have the same number of reccurent layers'
        assert encoder.hidden_size == decoder.hidden_size,\
            'Encoder and Decoder must have the same number of reccurrent hidden units'
        super(SeqToSeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Map the encoder's 2*n_layers bidirectional states onto the
        # decoder's n_layers, acting on the layer dimension via permute.
        self.init_h0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.init_c0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.device = device

    def forward(self, src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            src_lengths: Tensor[batch_size,]
            dest_sequences: Tensor[seq_len, batch_size]
            dest_lengths: Tensor[batch_size,]
            tf_ratio: float
        :return
            logits: Tensor[max(decode_lengths), batch_size, vocab_size]
            sorted_dest_sequences: Tensor[seq_len, batch_size]
            sorted_decode_lengths: list[int]
            sorted_indices: Tensor[batch_size,]
        """
        # Encoding
        _, h_state, c_state = self.encoder(
            input_sequences=src_sequences, sequence_lengths=src_lengths
        )
        # h_state / c_state: [n_layers * 2, batch_size, hidden_size]
        # Sort the batch (dest) by decreasing lengths so that at step t the
        # first batch_size_t sequences are exactly the still-active ones.
        sorted_dest_lengths, sorted_indices = torch.sort(dest_lengths, dim=0, descending=True)
        sorted_dest_sequences = dest_sequences[:, sorted_indices]
        h_state = h_state[:, sorted_indices, :]
        c_state = c_state[:, sorted_indices, :]
        # Init decoder hidden and memory states from the encoder's states
        h_state = self.init_h0(h_state.permute(1, 2, 0))  # [batch, hidden, n_layers]
        c_state = self.init_c0(c_state.permute(1, 2, 0))  # [batch, hidden, n_layers]
        h_state = h_state.permute(2, 0, 1)  # [n_layers, batch, hidden]
        c_state = c_state.permute(2, 0, 1)  # [n_layers, batch, hidden]
        # We won't decode at the <eos> position, since we've finished
        # generating as soon as we generate <eos>: decode lengths = actual - 1
        sorted_decode_lengths = (sorted_dest_lengths - 1).tolist()
        # Decoding
        batch_size, last = dest_sequences.size(1), None
        logits = torch.zeros(max(sorted_decode_lengths), batch_size,
                             self.decoder.vocab_size).to(self.device)
        for t in range(max(sorted_decode_lengths)):
            batch_size_t = sum([l > t for l in sorted_decode_lengths])
            # Original semantics preserved: with prob. tf_ratio feed back the
            # model's own previous prediction, otherwise the ground truth.
            if last is not None and random.random() < tf_ratio:
                in_ = last[:batch_size_t]
            else:
                in_ = sorted_dest_sequences[t, :batch_size_t]
            # in_: [batch_size_t,]
            logit, h_state, c_state = self.decoder(
                in_,
                h_state[:, :batch_size_t, :].contiguous(),
                c_state[:, :batch_size_t, :].contiguous(),
            )
            # logit: [batch_size_t, vocab_size]
            logits[t, :batch_size_t, :] = logit
            last = torch.argmax(F.softmax(logit, dim=1), dim=1)  # [batch_size_t,]
        return logits, sorted_dest_sequences, sorted_decode_lengths, sorted_indices
# Model initialization — hyper-parameters
MODEL_NAME = 'seq2seq-lstm'
N_LAYERS = 4
HIDDEN_SIZE = 512
# NOTE(review): unconventional casing kept so other references to this
# constant elsewhere in the file keep working; should be EMBEDDING_SIZE.
EMbedDING_SIZE = 300
ENC_DROPOUT = 0.3
ENC_RECURRENT_DROPOUT = 0.25
DEC_DROPOUT = 0.15
DEC_RECURRENT_DROPOUT = 0.2
N_EPOCHS = 15
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
GRAD_CLIP = 1.0
TF_RATIO = 1.0
encoder = EncoderLSTM(
    embedding_size=EMbedDING_SIZE,
    vocab_size=len(FR.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
    recurrent_dropout=ENC_RECURRENT_DROPOUT,
)
encoder.load_pretrained_embeddings(fr_embeddings)
encoder.fine_tuning_embeddings(fine_tune=True)
# BUG FIX: the decoder was constructed without hidden_size / n_layers,
# which DecoderLSTM requires (they must match the encoder's).
decoder = DecoderLSTM(
    embedding_size=EMbedDING_SIZE,
    vocab_size=len(EN.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
    recurrent_dropout=DEC_RECURRENT_DROPOUT,
)
decoder.load_pretrained_embeddings(en_embeddings)
decoder.fine_tuning_embeddings(fine_tune=True)
seq2seq = SeqToSeqLSTM(encoder=encoder, decoder=decoder, device=DEVICE)
seq2seq.apply(torch_utils.xavier_init_weights)
seq2seq.to(DEVICE)
optimizer = optim.RMSprop(params=seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
print(f'Number of parameters of the model: {torch_utils.count_parameters(seq2seq):,}')
# Number of parameters of the model: 41,471,097
使用上述模型,一个时期的训练时间为04:31分钟。
当我添加注意力机制(Luong风格)后,一个时期(epoch)的训练时间变为1:55:47。我想知道这是为什么。
# Model definition
# Reconstructed: the scraped original collapsed this class to a fragment
# ("def __init__(self,cn"); the encoder of the attention model is the same
# bidirectional LSTM encoder as in the first (no-attention) model.
class EncoderLSTM(nn.Module):
    """Bidirectional LSTM encoder over embedded source-token sequences."""

    def __init__(self, embedding_size, vocab_size, hidden_size, n_layers,
                 dropout, recurrent_dropout):
        super(EncoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=n_layers,
            bidirectional=True,
            dropout=(recurrent_dropout if n_layers > 1 else 0),
        )

    def load_pretrained_embeddings(self, embeddings):
        """Replace the embedding matrix with pre-trained vectors."""
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        """Enable/disable gradient updates on the embedding table."""
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_sequences, sequence_lengths):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,] (sorted descending)
        :return
            outputs: Tensor[seq_len, batch_size, 2 * hidden_size]
            hn: Tensor[n_layers * 2, batch_size, hidden_size]
            cn: Tensor[n_layers * 2, batch_size, hidden_size]
        """
        embedded = self.embedding(input_sequences)
        embedded = F.dropout(embedded, p=self.dropout)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)
        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        return outputs, hn, cn
class LuongAttention(nn.Module):
    """Luong-style attention: scaled 'dot' or additive 'concat' scoring."""

    def __init__(self, hidden_size, method):
        # BUG FIX: the original raised `NotImplemented(...)`, which is a
        # constant, not an exception class — `raise` would itself fail with
        # TypeError. NotImplementedError is the intended exception.
        if method not in ['dot', 'concat']:
            raise NotImplementedError(f'The {method} attention is not defined!')
        super(LuongAttention, self).__init__()
        self.hidden_size = hidden_size
        self.method = method
        if method == 'dot':
            pass  # dot scoring needs no parameters
        elif method == 'concat':
            self.W = nn.Linear(hidden_size, hidden_size)
            self.V = nn.Linear(hidden_size, 1)
        else:
            raise NotImplementedError(f'{method} not implemented!')

    def forward(self, h_state, enc_outputs, mask):
        """
        :args
            h_state: Tensor[n_layers, batch_size, hidden_size]
            enc_outputs: Tensor[seq_len, batch_size, hidden_size]
            mask: Tensor[seq_len, batch_size] — 0 at <pad> positions
        :return
            attn_weights: Tensor[seq_len, batch_size, 1]
        """
        # Collapse multi-layer states into a single query vector.
        if h_state.shape[0] > 1:
            h_state = h_state.sum(dim=0)       # [batch_size, hidden_size]
            h_state = h_state.unsqueeze(0)     # [1, batch_size, hidden_size]
        # Calculating the alignment scores
        if self.method == 'dot':
            scores = torch.sum(h_state * enc_outputs, dim=2)
            scores = scores.unsqueeze(dim=2) / np.sqrt(self.hidden_size)  # [seq_len, batch, 1]
        elif self.method == 'concat':
            scores = self.V(
                torch.tanh(self.W(
                    enc_outputs + h_state  # [seq_len, batch, hidden]
                ))
            )  # [seq_len, batch, 1]
        else:
            # BUG FIX: original referenced bare `method` (undefined here).
            raise NotImplementedError(f'{self.method} not implemented!')
        # Apply mask to ignore <pad> tokens
        mask = mask.unsqueeze(2)  # [seq_len, batch, 1]
        scores = scores.masked_fill(mask == 0, -1e10)
        # BUG FIX: scores are [seq_len, batch, 1]; the weights must be
        # normalized over the source positions (dim=0), not over the batch
        # (the original's dim=1) — otherwise masking/normalization is wrong.
        attn_weights = F.softmax(scores, dim=0)  # [seq_len, batch, 1]
        return attn_weights
class DecoderLSTM(nn.Module):
    """LSTM decoder with Luong attention over the encoder outputs.

    Reconstructed: the scraped original fused the nn.Embedding/nn.LSTM lines
    and the fc2/forward signature (L222, L224, L235); restored from the
    no-attention decoder plus the shape comments the original carried.
    """

    def __init__(self, embedding_size, vocab_size, hidden_size, n_layers,
                 dropout, recurrent_dropout, attention):
        super(DecoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.attention = attention
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(
            embedding_size,
            hidden_size,
            num_layers=n_layers,
            dropout=(recurrent_dropout if n_layers > 1 else 0),
        )
        # fc1 fuses [context; hidden] -> hidden, fc2 projects to the vocab.
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def load_pretrained_embeddings(self, embeddings):
        """Replace the embedding matrix with pre-trained vectors."""
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        """Enable/disable gradient updates on the embedding table."""
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_word_index, h_state, c_state, enc_outputs, mask):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[n_layers, batch_size, hidden_size]
            c_state: Tensor[n_layers, batch_size, hidden_size]
            enc_outputs: Tensor[seq_len, batch_size, hidden_size]
            mask: Tensor[seq_len, batch_size]
        :return
            logit: Tensor[batch_size, vocab_size]
            h_state: Tensor[n_layers, batch_size, hidden_size]
            c_state: Tensor[n_layers, batch_size, hidden_size]
            attn_weights: Tensor[seq_len, batch_size]
        """
        embedded = self.embedding(input_word_index.unsqueeze(0))  # [1, batch, emb]
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        # outputs: [1, batch, hidden] — used as the attention query
        attn_weights = self.attention(h_state=outputs, enc_outputs=enc_outputs,
                                      mask=mask)  # [seq_len, batch, 1]
        # Context vector = attention-weighted sum of encoder outputs
        context_vector = torch.bmm(
            enc_outputs.permute(1, 2, 0),   # [batch, hidden, seq_len]
            attn_weights.permute(1, 0, 2),  # [batch, seq_len, 1]
        ).permute(2, 0, 1)  # [1, batch, hidden]
        # New input: concatenate context_vector with the hidden state
        new_input = torch.cat((context_vector, outputs), dim=2)  # [1, batch, hidden*2]
        out = torch.tanh(self.fc1(new_input.squeeze(0)))
        logit = self.fc2(out)  # [batch, vocab_size]
        return logit, h_state, c_state, attn_weights.squeeze(2)
class SeqToSeqLSTM(nn.Module):
    """Encoder-decoder wrapper with Luong attention and teacher forcing.

    Reconstructed: the scraped original lost most of forward() (L269-L283);
    restored from the first SeqToSeqLSTM plus the attention-specific pieces
    that survived (mask creation, enc_outputs projection, decoder call).
    """

    def __init__(self, encoder, decoder, pad_index, device):
        super(SeqToSeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_index = pad_index
        # Map the encoder's 2*n_layers bidirectional states to n_layers.
        self.init_h0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.init_c0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        # Project bidirectional encoder outputs 2*hidden -> hidden so the
        # decoder can attend over them.
        self.fc = nn.Linear(2 * encoder.hidden_size, encoder.hidden_size)
        self.device = device

    def create_mask(self, src_sequences):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
        :return
            mask: Tensor[seq_len, batch_size] — False at <pad> positions
        """
        mask = (src_sequences != self.pad_index)
        return mask

    def forward(self, src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            src_lengths: Tensor[batch_size,]
            dest_sequences: Tensor[seq_len, batch_size]
            dest_lengths: Tensor[batch_size,]
            tf_ratio: float
        :return
            logits: Tensor[max(decode_lengths), batch_size, vocab_size]
            sorted_dest_sequences: Tensor[seq_len, batch_size]
            sorted_decode_lengths: list[int]
            sorted_indices: Tensor[batch_size,]
        """
        mask = self.create_mask(src_sequences)  # [seq_len, batch_size]
        # Encoding
        enc_outputs, h_state, c_state = self.encoder(
            input_sequences=src_sequences, sequence_lengths=src_lengths
        )
        enc_outputs = self.fc(enc_outputs)  # [seq_len, batch, hidden]
        # Sort the batch (dest) by decreasing lengths
        sorted_dest_lengths, sorted_indices = torch.sort(dest_lengths, dim=0, descending=True)
        sorted_dest_sequences = dest_sequences[:, sorted_indices]
        enc_outputs = enc_outputs[:, sorted_indices, :]
        mask = mask[:, sorted_indices]
        h_state = h_state[:, sorted_indices, :]
        c_state = c_state[:, sorted_indices, :]
        # Init decoder hidden and memory states
        h_state = self.init_h0(h_state.permute(1, 2, 0)).permute(2, 0, 1)
        c_state = self.init_c0(c_state.permute(1, 2, 0)).permute(2, 0, 1)
        # Don't decode at the <eos> position: decode lengths = actual - 1
        sorted_decode_lengths = (sorted_dest_lengths - 1).tolist()
        # Decoding
        batch_size, last = dest_sequences.size(1), None
        logits = torch.zeros(max(sorted_decode_lengths), batch_size,
                             self.decoder.vocab_size).to(self.device)
        for t in range(max(sorted_decode_lengths)):
            batch_size_t = sum([l > t for l in sorted_decode_lengths])
            if last is not None and random.random() < tf_ratio:
                in_ = last[:batch_size_t]
            else:
                in_ = sorted_dest_sequences[t, :batch_size_t]
            logit, h_state, c_state, _ = self.decoder(
                in_,
                h_state[:, :batch_size_t, :].contiguous(),
                c_state[:, :batch_size_t, :].contiguous(),
                enc_outputs[:, :batch_size_t, :],
                mask[:, :batch_size_t],
            )
            logits[t, :batch_size_t, :] = logit
            last = torch.argmax(F.softmax(logit, dim=1), dim=1)
        return logits, sorted_dest_sequences, sorted_decode_lengths, sorted_indices
# Model initialization (attention model).
# Reconstructed: L285/L289/L292 were truncated by the scrape; restored by
# analogy with the first model's initialization block.
encoder = EncoderLSTM(
    embedding_size=EMbedDING_SIZE,
    vocab_size=len(FR.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
    recurrent_dropout=ENC_RECURRENT_DROPOUT,
)
encoder.load_pretrained_embeddings(fr_embeddings)
encoder.fine_tuning_embeddings(fine_tune=True)
attention = LuongAttention(hidden_size=HIDDEN_SIZE, method='dot')
decoder = DecoderLSTM(
    embedding_size=EMbedDING_SIZE,
    vocab_size=len(EN.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
    recurrent_dropout=DEC_RECURRENT_DROPOUT,
    attention=attention,
)
decoder.load_pretrained_embeddings(en_embeddings)
decoder.fine_tuning_embeddings(fine_tune=True)
# NOTE(review): the mask is built from the SOURCE (French) sequences, yet the
# pad index comes from the English vocab — confirm FR's pad index is intended.
seq2seq = SeqToSeqLSTM(
    encoder=encoder,
    decoder=decoder,
    pad_index=EN.vocab.stoi[EN.pad_token],
    device=DEVICE,
)
seq2seq.apply(torch_utils.xavier_init_weights)
seq2seq.to(DEVICE)
optimizer = optim.RMSprop(params=seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
print(f'Number of parameters of the model: {torch_utils.count_parameters(seq2seq):,}')
# Number of parameters of the model: 42,520,697
两个模型的参数数量几乎相同。
数据来自europarl并行语料库,其中有89,752个示例,每个示例的长度在15到25之间。
有人知道为什么带有注意力机制的模型要花这么长时间吗?
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。