Assertion error (assert cur_offset == offset) in loss.backward() pytorch

nafizh Source

I am trying to train a GRU model, but running into this assertion error at the point of loss.backward(). Here is my code with the error. Any help is appreciated.

class AttendResistance(nn.Module):
       def __init__(self, nb_classes, nb_tokens, embedding_matrix, 
   embed_dropout_rate=0, final_dropout_rate=0, return_attention=False):
        super(AttendResistance, self).__init__()
    embedding_dim = 20
    hidden_size = 32

    self.embed_dropout_rate = embed_dropout_rate
    self.final_dropout_rate = final_dropout_rate
    self.return_attention = return_attention
    self.hidden_size = hidden_size
    self.nb_classes = nb_classes

    self.embed = nn.Embedding(nb_tokens, embedding_dim)
    self.embed.weight = nn.Parameter(embedding_matrix)
    self.embed_dropout = nn.Dropout2d(embed_dropout_rate)
    self.gru = nn.GRU(embedding_dim, hidden_size, num_layers = 1, batch_first=True, dropout = 0.5, 
                    bidirectional=False)
    self.final_drop = nn.Dropout(final_dropout_rate)
    self.linear = nn.Linear(hidden_size, nb_classes)
    self.softmax = nn.Softmax(dim = 1)

def forward(self, input_seqs):
    print (input_seqs.size())
    x = self.embed(input_seqs)
    print (x.size())
    x = nn.Tanh()(x)
    print (x.size())
    x = self.embed_dropout(x)
    print (x.size())
    x, _ = self.gru(x)  
    print (x.size())      
    x = self.final_drop(x)
    print (x.size())
    x = self.linear(x[:, -1, :].float())
    print (x.size())
    outputs = self.softmax(x)
    print (outputs.size())

    if self.return_attention:
        return outputs, att_weights
    else:
        return outputs

attn_res = AttendResistance(268, 20, embedding_matrix, 0.5, 0.3, True)
attn_res = attn_res.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(attn_res.parameters())

num_epochs = 10
for epoch in range(num_epochs):
    for i, (prot_seqs, labels) in enumerate(train_loader):
        prot_seqs = Variable(prot_seqs.long()).cuda()
        labels = Variable(labels.long()).cuda()

        #print (prot_seqs)
        #print (labels)
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs, att_weights = attn_res(prot_seqs)
        print (outputs)
        loss = criterion(outputs, torch.max(labels, 1)[1])
        print (loss)
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
                   %(epoch+1, num_epochs, i+1, len(X_train)//batch_size, loss.data[0]))

And here is the error with the print output:

torch.Size([64, 1602])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 20])
torch.Size([64, 1602, 32])
torch.Size([64, 1602, 32])
torch.Size([64, 268])
torch.Size([64, 268])
Variable containing:
1.00000e-03 *
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
          ...             ⋱             ...          
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
 3.5743  3.7436  4.2370  ...   3.9607  4.2058  4.2674
[torch.cuda.FloatTensor of size 64x268 (GPU 0)]

Variable containing:
 5.5909
[torch.cuda.FloatTensor of size 1 (GPU 0)]


---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-89-a32cf2edb4cc> in <module>()
     17         print (torch.sum(att_weights))
     18         print (loss)
---> 19         loss.backward()
     20         optimizer.step()
     21 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/variable.py in backward(self, gradient, retain_graph, create_graph, retain_variables)
    165                 Variable.
    166         """
--> 167         torch.autograd.backward(self, gradient, retain_graph, create_graph, retain_variables)
    168 
    169     def register_hook(self, hook):

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(variables, grad_variables, retain_graph, create_graph, retain_variables)
     97 
     98     Variable._execution_engine.run_backward(
---> 99         variables, grad_variables, retain_graph)
    100 
    101 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in _do_backward(self, gradients, retain_variables)
    333     def _do_backward(self, gradients, retain_variables):
    334         self.retain_variables = retain_variables
--> 335         result = super(NestedIOFunction, self)._do_backward(gradients, retain_variables)
    336         if not retain_variables:
    337             del self._nested_output

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/autograd/function.py in backward(self, *gradients)
    341     def backward(self, *gradients):
    342         nested_gradients = _unflatten(gradients, self._nested_output)
--> 343         result = self.backward_extended(*nested_gradients)
    344         return tuple(_iter_None_tensors(result))
    345 

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in backward_extended(self, grad_output, grad_hy)
    333                 output,
    334                 weight,
--> 335                 grad_weight)
    336         else:
    337             grad_weight = [(None,) * len(layer_weight) for layer_weight in weight]

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in backward_weight(fn, input, hx, output, weight, grad_weight)
    466 
    467         # copy the weights from the weight_buf into grad_weight
--> 468         grad_params = get_parameters(fn, handle, dw)
    469         _copyParams(grad_params, grad_weight)
    470         return grad_weight

/home/nafizh/anaconda3/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in get_parameters(fn, handle, weight_buf)
    169                     layer_params.append(param)
    170                 else:
--> 171                     assert cur_offset == offset
    172 
    173                 cur_offset = offset + filter_dim_a[0]

AssertionError: 

I am a newbie with pytorch. Because the error is not giving me any explicit message, I don’t know what am I doing wrong here. I am aware it is an assertion error that is happening. But I don’t know what are these cur_offset and offset variables are. Running on,

0.3.0.post4

Cuda compilation tools, release 8.0, V8.0.61

pythonmachine-learningdeep-learningpytorch

Answers

answered 4 months ago yun xi #1

@nafizh, i had the same problem, and fixed this by adding something like embedding_matrix.float(), and removed all float(), double() calling. you can find details in the like blew.

Ref: https://github.com/pytorch/pytorch/issues/5004

comments powered by Disqus