diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py
index f329eb6842..b0188d4d88 100644
--- a/src/transformers/modeling_bart.py
+++ b/src/transformers/modeling_bart.py
@@ -86,7 +86,7 @@ def _prepare_bart_decoder_inputs(
             causal_lm_mask = None
         new_shape = (bsz, tgt_len, tgt_len)
         # make it broadcastable so can just be added to the attention coefficients
-        decoder_attn_mask = _combine_masks(decoder_padding_mask, causal_lm_mask, new_shape)
+        decoder_attn_mask = _combine_masks(decoder_padding_mask, causal_lm_mask, new_shape).to(device=input_ids.device)
     assert decoder_attn_mask is None or decoder_attn_mask.shape == (bsz, 1, tgt_len, tgt_len)
     return decoder_input_ids, decoder_attn_mask
 
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py
index 927c37eadf..ef8932618a 100644
--- a/tests/test_modeling_bart.py
+++ b/tests/test_modeling_bart.py
@@ -172,7 +172,7 @@ class BartHeadTests(unittest.TestCase):
     vocab_size = 99
 
     def test_lm_forward(self):
-        input_ids = torch.Tensor(
+        input_ids = torch.tensor(
             [
                 [71, 82, 18, 33, 46, 91, 2],
                 [68, 34, 26, 58, 30, 82, 2],
@@ -187,8 +187,10 @@ class BartHeadTests(unittest.TestCase):
                 [21, 5, 62, 28, 14, 76, 2],
                 [45, 98, 37, 86, 59, 48, 2],
                 [70, 70, 50, 9, 28, 0, 2],
-            ]
-        ).long()
+            ],
+            dtype=torch.long,
+            device=torch_device,
+        )
         batch_size = input_ids.shape[0]
         decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]], self.vocab_size)
 
@@ -204,12 +206,14 @@ class BartHeadTests(unittest.TestCase):
             max_position_embeddings=48,
         )
         model = BartForSequenceClassification(config)
+        model.to(torch_device)
         outputs = model.forward(input_ids=input_ids, decoder_input_ids=input_ids)
         logits = outputs[0]
         expected_shape = torch.Size((batch_size, config.num_labels))
         self.assertEqual(logits.shape, expected_shape)
 
         lm_model = BartForMaskedLM(config)
+        lm_model.to(torch_device)
         loss, logits, enc_features = lm_model.forward(
             input_ids=input_ids, lm_labels=decoder_lm_labels, decoder_input_ids=input_ids
         )
@@ -292,6 +296,10 @@ def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
         raise AssertionError(msg)
 
 
+def _long_tensor(tok_lst):
+    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device,)
+
+
 TOLERANCE = 1e-4
 
 
@@ -299,15 +307,15 @@ TOLERANCE = 1e-4
 class BartModelIntegrationTest(unittest.TestCase):
     @slow
     def test_inference_no_head(self):
-        model = BartModel.from_pretrained("bart-large")
-        input_ids = torch.Tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]).long()
+        model = BartModel.from_pretrained("bart-large").to(torch_device)
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
         with torch.no_grad():
             output = model.forward(**inputs_dict)[0]
         expected_shape = torch.Size((1, 11, 1024))
         self.assertEqual(output.shape, expected_shape)
         expected_slice = torch.Tensor(
-            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]]
+            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
         )
         self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
 
@@ -315,20 +323,22 @@ class BartModelIntegrationTest(unittest.TestCase):
     def test_mnli_inference(self):
 
         example_b = [0, 31414, 232, 328, 740, 1140, 69, 46078, 1588, 2, 1]
-        input_ids = torch.Tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b]).long()
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2], example_b])
 
-        model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli")  # eval called in from_pre
+        model = AutoModelForSequenceClassification.from_pretrained("bart-large-mnli").to(
+            torch_device
+        )  # eval called in from_pre
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids)
         # Test that model hasn't changed
         with torch.no_grad():
             batched_logits, features = model.forward(**inputs_dict)
         expected_shape = torch.Size((2, 3))
         self.assertEqual(batched_logits.shape, expected_shape)
-        expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]])
+        expected_slice = torch.Tensor([[0.1907, 1.4342, -1.0289]]).to(torch_device)
         logits_arr = batched_logits[0].detach()
 
         # Test that padding does not change results
-        input_ids_no_pad = torch.Tensor([example_b[:-1]]).long()
+        input_ids_no_pad = _long_tensor([example_b[:-1]])
 
         inputs_dict = prepare_bart_inputs_dict(model.config, input_ids=input_ids_no_pad)
         with torch.no_grad():
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 669a494cf2..4e5202a65a 100644
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -68,7 +68,7 @@ class ModelTesterMixin:
             model.eval()
             with torch.no_grad():
                 outputs = model(**inputs_dict)
-            out_2 = outputs[0].numpy()
+            out_2 = outputs[0].cpu().numpy()
             out_2[np.isnan(out_2)] = 0
 
             with tempfile.TemporaryDirectory() as tmpdirname:
@@ -472,6 +472,7 @@ class ModelTesterMixin:
         for model_class in self.all_model_classes:
             config = copy.deepcopy(original_config)
             model = model_class(config)
+            model.to(torch_device)
 
             model_vocab_size = config.vocab_size
             # Retrieve the embeddings and clone theme
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index d62ba2bd79..1d7738b64b 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -20,7 +20,7 @@ from transformers import is_torch_available
 
 from .test_configuration_common import ConfigTester
 from .test_modeling_common import ModelTesterMixin, ids_tensor
-from .utils import CACHE_DIR, require_torch, slow
+from .utils import CACHE_DIR, require_torch, slow, torch_device
 
 
 if is_torch_available():
@@ -125,6 +125,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
             decoder_lm_labels,
         ):
             model = T5Model(config=config)
+            model.to(torch_device)
             model.eval()
             decoder_output, encoder_output = model(
                 encoder_input_ids=encoder_input_ids,
@@ -157,6 +158,7 @@ class T5ModelTest(ModelTesterMixin, unittest.TestCase):
             decoder_lm_labels,
         ):
             model = T5WithLMHeadModel(config=config)
+            model.to(torch_device)
             model.eval()
             outputs = model(
                 encoder_input_ids=encoder_input_ids,