I am trying to train a GRU model, but I am running into an assertion error at the call to loss.backward(). Here is my code, followed by the error. Any help is appreciated.
class AttendResistance(nn.Module):
    """GRU-based sequence classifier on top of a pre-initialised embedding.

    Pipeline: embedding -> tanh -> embedding dropout -> single-layer GRU ->
    final dropout -> linear layer on the last time step -> softmax.

    Args:
        nb_classes: Number of output classes.
        nb_tokens: Vocabulary size (number of rows of the embedding table).
        embedding_matrix: Tensor used as the initial embedding weights; it
            replaces the freshly created ``nn.Embedding`` weight.
        embed_dropout_rate: Dropout probability applied to the embeddings.
        final_dropout_rate: Dropout probability applied to the GRU output.
        return_attention: If true, ``forward`` returns a 2-tuple
            ``(outputs, att_weights)`` instead of ``outputs`` alone.
    """

    def __init__(self, nb_classes, nb_tokens, embedding_matrix,
                 embed_dropout_rate=0, final_dropout_rate=0, return_attention=False):
        super(AttendResistance, self).__init__()
        embedding_dim = 20
        hidden_size = 32

        self.embed_dropout_rate = embed_dropout_rate
        self.final_dropout_rate = final_dropout_rate
        self.return_attention = return_attention
        self.hidden_size = hidden_size
        self.nb_classes = nb_classes

        self.embed = nn.Embedding(nb_tokens, embedding_dim)
        # Force float32 so the embedding output matches the dtype of the GRU
        # and linear weights. NOTE(review): a float64 matrix (e.g. one built
        # from a NumPy array) is presumably what led to the `.float()` patch
        # after the GRU and may be what trips the cuDNN
        # `cur_offset == offset` assertion -- confirm against your data.
        self.embed.weight = nn.Parameter(embedding_matrix.float())
        self.embed_dropout = nn.Dropout2d(embed_dropout_rate)
        # `dropout` is only applied between stacked GRU layers; with
        # num_layers=1 it has no effect (and warns), so it is not passed.
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=1, batch_first=True)
        self.final_drop = nn.Dropout(final_dropout_rate)
        self.linear = nn.Linear(hidden_size, nb_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_seqs):
        """Classify a batch of token-id sequences.

        Args:
            input_seqs: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            ``outputs`` of shape (batch, nb_classes) holding softmax
            probabilities, or ``(outputs, att_weights)`` when
            ``return_attention`` is set. This module defines no attention
            layer, so ``att_weights`` is always ``None``.
        """
        # The size prints are debug traces of the tensor shape after each stage.
        print(input_seqs.size())
        x = self.embed(input_seqs)          # (batch, seq_len, embedding_dim)
        print(x.size())
        x = nn.Tanh()(x)
        print(x.size())
        x = self.embed_dropout(x)
        print(x.size())
        x, _ = self.gru(x)                  # (batch, seq_len, hidden_size)
        print(x.size())
        x = self.final_drop(x)
        print(x.size())
        # Only the last time step feeds the classifier.
        x = self.linear(x[:, -1, :])        # (batch, nb_classes)
        print(x.size())
        # NOTE: nn.CrossEntropyLoss applies log-softmax itself and expects raw
        # logits; if this output is fed to it, softmax is effectively applied
        # twice. Kept here so the module's output stays a probability vector.
        outputs = self.softmax(x)
        print(outputs.size())

        if self.return_attention:
            # No attention weights exist in this model; return a placeholder
            # so callers that unpack two values keep working instead of
            # hitting a NameError.
            att_weights = None
            return outputs, att_weights
        return outputs
# Build the model (268 classes, 20 tokens) and move it to the GPU.
attn_res = AttendResistance(268, 20, embedding_matrix, 0.5, 0.3, True)
attn_res = attn_res.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(attn_res.parameters())

num_epochs = 10
for epoch in range(num_epochs):
    for i, (prot_seqs, labels) in enumerate(train_loader):
        prot_seqs = Variable(prot_seqs.long()).cuda()
        labels = Variable(labels.long()).cuda()

        # Forward + Backward + Optimize
        # Clear gradients left over from the previous step; otherwise they
        # accumulate across iterations.
        optimizer.zero_grad()
        outputs, att_weights = attn_res(prot_seqs)
        print(outputs)
        # torch.max(labels, 1)[1] is the argmax along dim 1, i.e. it turns
        # per-class label rows into the class indices CrossEntropyLoss expects.
        loss = criterion(outputs, torch.max(labels, 1)[1])
        print(loss)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            # loss.data[0] is the pre-0.4 way to read a scalar loss; on
            # PyTorch >= 0.4 use loss.item() instead.
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(X_train) // batch_size, loss.data[0]))
And here is the error with the print output:
torch.Size([64, 1602])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 32])
torch.Size([64, 1602, 32])
torch.Size([64, 268])
torch.Size([64, 268])
Variable containing:
1.00000e-03 *
3.5743 3.7436 4.2370 ... 3.9607 4.2058 4.2674
3.5743 3.7436 4.2370 ... 3.9607 4.2058 4.2674
3.5743 3.7436 4.2370 ... 3.9607 4.2058 4.2674
... ⋱ ...
3.5743 3.7436 4.2370 ... 3.9607 4.2058 4.2674
3.5743 3.7436 4.2370 ... 3.9607 4.2058 4.2674
3.5743 3.7436 4.2370 ... 3.9607 4.2058 4.2674
[torch.cuda.FloatTensor of size 64x268 (GPU 0)]
Variable containing:
[torch.cuda.FloatTensor of size 1 (GPU 0)]
AssertionError Traceback (most recent call last)
<ipython-input-89-a32cf2edb4cc> in <module>()
17 print (torch.sum(att_weights))
18 print (loss)
---> 19 loss.backward()
20 optimizer.step()
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
165 Variable.
166 """
--> 167 torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
169 def register_hook(self, hook):
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
98 Variable._execution_engine.run_backward(
---> 99 variables, grad_variables, retain_graph)
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
333 def _do_backward(self, gradients, retain_variables):
334 self.retain_variables = retain_variables
--> 335 result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
336 if not retain_variables:
337 del self._nested_output
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
341 def backward(self, *gradients):
342 nested_gradients = _unflatten(gradients, self._nested_output)
--> 343 result = self.backward_extended(*nested_gradients)
344 return tuple(_iter_None_tensors(result))
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
333 output,
334 weight,
--> 335 grad_weight)
336 else:
337 grad_weight = [(None,) * len(layer_weight) for layer_weight in weight]
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_weight(fn, input, hx, output, weight, grad_weight)
467 # copy the weights from the weight_buf into grad_weight
--> 468 grad_params = get_parameters(fn, handle, dw)
469 _copyParams(grad_params, grad_weight)
470 return grad_weight
/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in get_parameters(fn, handle, weight_buf)
169 layer_params.append(param)
170 else:
--> 171 assert cur_offset == offset
173 cur_offset = offset + filter_dim_a[0]
I am a newbie with PyTorch. Because the error does not give me any explicit message, I don’t know what I am doing wrong here. I am aware that it is an assertion error, but I don’t know what the cur_offset and offset variables are. Running on,
Cuda compilation tools, release 8.0, V8.0.61
@nafizh, I had the same problem, and fixed it by adding something like [code snippet missing from the post]
, and by removing all float() and double()
calls. You can find details in the link below. Ref: https://github.com/pytorch/pytorch/issues/5004