From 952a77b05da2484eb3f5d9d1cd67612dbb949e2d Mon Sep 17 00:00:00 2001
From: Patrick von Platen <patrick.v.platen@gmail.com>
Date: Mon, 20 Dec 2021 15:22:50 +0100
Subject: [PATCH] [Perceiver] Skip multi-gpu tests for now (#14813)

* [Perceiver] Skip multi-gpu tests for now

* Update tests/test_modeling_perceiver.py

* up

* up
---
 docs/source/model_doc/perceiver.mdx           |  6 ++++-
 docs/source/model_doc/reformer.rst            |  5 ++++
 docs/source/model_doc/transformerxl.rst       |  5 ++++
 .../models/perceiver/modeling_perceiver.py    |  4 ++-
 tests/test_modeling_perceiver.py              | 26 +++----------------
 tests/test_modeling_reformer.py               |  4 ++-
 tests/test_modeling_transfo_xl.py             |  4 ++-
 7 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/docs/source/model_doc/perceiver.mdx b/docs/source/model_doc/perceiver.mdx
index ff9bbaedb4..b474074e8b 100644
--- a/docs/source/model_doc/perceiver.mdx
+++ b/docs/source/model_doc/perceiver.mdx
@@ -86,6 +86,10 @@ is implemented in the library. Note that the models available in the library onl
 with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, 
 audio classification, video classification, etc. 
 
+**Note**:
+
+- Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)
+
 ## Perceiver specific outputs
 
 [[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput
@@ -208,4 +212,4 @@ audio classification, video classification, etc.
 ## PerceiverForMultimodalAutoencoding
 
 [[autodoc]] PerceiverForMultimodalAutoencoding
-    - forward
\ No newline at end of file
+    - forward
diff --git a/docs/source/model_doc/reformer.rst b/docs/source/model_doc/reformer.rst
index ea48ce5368..6842884d76 100644
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -35,6 +35,11 @@ while being much more memory-efficient and much faster on long sequences.*
 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
 found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`__.
 
+**Note**:
+
+- Reformer does **not** work with ``torch.nn.DataParallel`` due to a bug in PyTorch, see `issue #36035
+  <https://github.com/pytorch/pytorch/issues/36035>`__
+
 Axial Positional Encodings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/transformerxl.rst b/docs/source/model_doc/transformerxl.rst
index df4ebecbf3..178268f522 100644
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -44,6 +44,11 @@ Tips:
 This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
 <https://github.com/kimiyoung/transformer-xl>`__.
 
+**Note**:
+
+- TransformerXL does **not** work with ``torch.nn.DataParallel`` due to a bug in PyTorch, see `issue #36035
+  <https://github.com/pytorch/pytorch/issues/36035>`__
+
 
 TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index ede1063130..d3f91c34df 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -2128,7 +2128,9 @@ class PerceiverBasicDecoder(PerceiverAbstractDecoder):
             # to get the indices for the unflattened array
             # unravel_index returns a tuple (x_idx, y_idx, ...)
             # stack to get the [n, d] tensor of coordinates
-            indices = list(torch.from_numpy(x) for x in np.unravel_index(subsampled_points, self.output_index_dims))
+            indices = list(
+                torch.from_numpy(x) for x in np.unravel_index(subsampled_points.cpu(), self.output_index_dims)
+            )
             pos = torch.stack(indices, dim=1)
             batch_size = inputs.shape[0]
             # Map these coordinates to [-1, 1]
diff --git a/tests/test_modeling_perceiver.py b/tests/test_modeling_perceiver.py
index 4e6e271448..128be2d371 100644
--- a/tests/test_modeling_perceiver.py
+++ b/tests/test_modeling_perceiver.py
@@ -758,29 +758,11 @@ class PerceiverModelTest(ModelTesterMixin, unittest.TestCase):
                     loss.backward()
 
     @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Perceiver does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
+    )
     def test_multi_gpu_data_parallel_forward(self):
-        for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
-
-            # some params shouldn't be scattered by nn.DataParallel
-            # so just remove them if they are present.
-            blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]
-            for k in blacklist_non_batched_params:
-                inputs_dict.pop(k, None)
-
-            # move input tensors to cuda:O
-            for k, v in inputs_dict.items():
-                if torch.is_tensor(v):
-                    inputs_dict[k] = v.to(0)
-
-            model = model_class(config=config)
-            model.to(0)
-            model.eval()
-
-            # Wrap model in nn.DataParallel
-            model = nn.DataParallel(model)
-            with torch.no_grad():
-                _ = model(**self._prepare_for_class(inputs_dict, model_class))
+        pass
 
     @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT")
     def test_save_load_fast_init_from_base(self):
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index dff424bb99..4ccaa41245 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -573,8 +573,10 @@ class ReformerTesterMixin:
         self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs)
 
     @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Reformer does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
+    )
     def test_multi_gpu_data_parallel_forward(self):
-        # Opt-out of this test.
         pass
 
     def test_for_sequence_classification(self):
diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py
index 4885e97329..c69f3b2490 100644
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -232,8 +232,10 @@ class TransfoXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestC
         return
 
     @require_torch_multi_gpu
+    @unittest.skip(
+        reason="Transfo-XL does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
+    )
     def test_multi_gpu_data_parallel_forward(self):
-        # Opt-out of this test.
         pass
 
     @slow