diff --git a/modeling_pytorch.py b/modeling_pytorch.py
index 07fb256104..32ae35b74f 100644
--- a/modeling_pytorch.py
+++ b/modeling_pytorch.py
@@ -493,7 +493,7 @@ class BertForQuestionAnswering(nn.Module):
             def compute_loss(logits, positions):
                 max_position = positions.max().item()
                 one_hot = torch.FloatTensor(batch_size, max(max_position, seq_length) +1, device=input_ids.device).zero_()
-                one_hot = one_hot.scatter(1, positions, 1)
+                one_hot = one_hot.scatter(1, positions.cpu(), 1) # Second argument need to be LongTensor and not cuda.LongTensor
                 one_hot = one_hot[:, :seq_length]
                 log_probs = nn.functional.log_softmax(logits, dim = -1).view(batch_size, seq_length)
                 loss = -torch.mean(torch.sum(one_hot*log_probs), dim = -1)