Pytorch: Error in DataParallel for RNN model

Shiloh_C

I'm trying to use torch.nn.DataParallel for an RNN model. My model looks like this:

class EncoderRNN(nn.Module):
    def __init__(self, vocal_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocal_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, input_batch, input_batch_length, hidden):
        embedded = self.embedding(input_batch)
        packed_input = nn.utils.rnn.pack_padded_sequence(embedded, input_batch_length.cpu().numpy(), batch_first=True)
        output, hidden = self.gru(packed_input, hidden)
        return output, hidden


class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, vocab_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, target_batch, target_batch_length, hidden, train=False):
        embedded = self.embedding(target_batch)
        output = F.relu(embedded)

        if train:
            # minus 1 to eliminate <EOS>
            packed_target = nn.utils.rnn.pack_padded_sequence(output, (target_batch_length - 1).cpu().numpy(),
                                                              batch_first=True)

        output, hidden = self.gru(packed_target, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

I wrapped the models in DataParallel when declaring them:

encoder = nn.DataParallel(encoder)
decoder = nn.DataParallel(decoder)

The code runs on a server with 4 GPUs, and I received the following error message:

/home/cjunjie/NLP/DocSummarization/model.py:18: UserWarning: RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters(). 
output, hidden = self.gru(packed_input, hidden)
Traceback (most recent call last):
  File "train.py", line 144, in <module>
    train_iteration(encoder, decoder, fileDataSet)
  File "train.py", line 110, in train_iteration
    target_indices, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
  File "train.py", line 41, in train
    encoder_output, encoder_hidden = encoder(input_batch, input_batch_length, encoder_hidden)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py", line 357, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 74, in forward
    return self.gather(outputs, self.output_device)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 86, in gather
    return gather(outputs, output_device, dim=self.dim)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 65, in gather
    return gather_map(outputs)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
    return type(out)(map(gather_map, zip(*outputs)))
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 60, in gather_map
    return type(out)(map(gather_map, zip(*outputs)))
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/utils/rnn.py", line 39, in __new__
    return super(PackedSequence, cls).__new__(cls, *args[0])
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 57, in gather_map
    return Gather.apply(target_device, dim, *outputs)
  File "/home/cjunjie/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/_functions.py", line 58, in forward
    assert all(map(lambda i: i.is_cuda, inputs))
AssertionError

I searched for the same problem, but none of the results I found had a solution. Can anyone help?

pytorch

Answers

answered 1 month ago patapouf_ai #1

In order to run the code on GPUs, you need to copy both the input variables and the model weights to CUDA. I suspect you did not copy the model weights. To do that, call:

encoder.cuda()
decoder.cuda()
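
As a minimal sketch of how this might fit together with the DataParallel wrapping (the sizes and tensor names here are illustrative, based on the names in your traceback, not your exact code):

import torch
import torch.nn as nn

# Hypothetical sizes, just for illustration.
encoder = EncoderRNN(vocal_size=1000, hidden_size=256)
decoder = DecoderRNN(hidden_size=256, vocab_size=1000)

# Move the model weights to the GPU so the parameters live on a CUDA device.
encoder.cuda()
decoder.cuda()

# Then wrap in DataParallel to replicate across the 4 GPUs.
encoder = nn.DataParallel(encoder)
decoder = nn.DataParallel(decoder)

# The input variables must also be on the GPU before the forward pass.
input_batch = input_batch.cuda()
encoder_hidden = encoder_hidden.cuda()

encoder_output, encoder_hidden = encoder(input_batch, input_batch_length, encoder_hidden)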
