[Perceiver] Skip multi-gpu tests for now (#14813)

* [Perceiver] Skip multi-gpu tests for now * Update tests/test_modeling_perceiver.py * up * up
2021-12-20 15:22:50 +01:00
parent 8a818c26cb
commit 952a77b05d
7 changed files with 28 additions and 26 deletions
--- a/docs/source/model_doc/perceiver.mdx
+++ b/docs/source/model_doc/perceiver.mdx
@@ -86,6 +86,10 @@ is implemented in the library. Note that the models available in the library onl
 with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection, 
 audio classification, video classification, etc. 
 **Note**:
 - Perceiver does **not** work with `torch.nn.DataParallel` due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)
 ## Perceiver specific outputs
 [[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -35,6 +35,11 @@ while being much more memory-efficient and much faster on long sequences.*
 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
 found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`__.
 **Note**:
 - Reformer does **not** work with ``torch.nn.DataParallel`` due to a bug in PyTorch, see `issue #36035
  <https://github.com/pytorch/pytorch/issues/36035>`__
 Axial Positional Encodings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -44,6 +44,11 @@ Tips:
 This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
 <https://github.com/kimiyoung/transformer-xl>`__.
 **Note**:
 - TransformerXL does **not** work with ``torch.nn.DataParallel`` due to a bug in PyTorch, see `issue #36035
  <https://github.com/pytorch/pytorch/issues/36035>`__
 TransfoXLConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -2128,7 +2128,9 @@ class PerceiverBasicDecoder(PerceiverAbstractDecoder):
            # to get the indices for the unflattened array
            # unravel_index returns a tuple (x_idx, y_idx, ...)
            # stack to get the [n, d] tensor of coordinates
-            indices = list(torch.from_numpy(x) for x in np.unravel_index(subsampled_points, self.output_index_dims))
+            indices = list(
                torch.from_numpy(x) for x in np.unravel_index(subsampled_points.cpu(), self.output_index_dims)
            )
            pos = torch.stack(indices, dim=1)
            batch_size = inputs.shape[0]
            # Map these coordinates to [-1, 1]
--- a/tests/test_modeling_perceiver.py
+++ b/tests/test_modeling_perceiver.py
@@ -758,29 +758,11 @@ class PerceiverModelTest(ModelTesterMixin, unittest.TestCase):
                    loss.backward()
    @require_torch_multi_gpu
    @unittest.skip(
        reason="Perceiver does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
    )
    def test_multi_gpu_data_parallel_forward(self):
-        for model_class in self.all_model_classes:
+        pass
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_model_class(model_class)
            # some params shouldn't be scattered by nn.DataParallel
            # so just remove them if they are present.
            blacklist_non_batched_params = ["head_mask", "decoder_head_mask", "cross_attn_head_mask"]
            for k in blacklist_non_batched_params:
                inputs_dict.pop(k, None)
            # move input tensors to cuda:0
            for k, v in inputs_dict.items():
                if torch.is_tensor(v):
                    inputs_dict[k] = v.to(0)
            model = model_class(config=config)
            model.to(0)
            model.eval()
            # Wrap model in nn.DataParallel
            model = nn.DataParallel(model)
            with torch.no_grad():
                _ = model(**self._prepare_for_class(inputs_dict, model_class))
    @unittest.skip(reason="Perceiver models don't have a typical head like is the case with BERT")
    def test_save_load_fast_init_from_base(self):
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -573,8 +573,10 @@ class ReformerTesterMixin:
        self.model_tester.create_and_check_reformer_model_fp16_generate(*config_and_inputs)
    @require_torch_multi_gpu
    @unittest.skip(
        reason="Reformer does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
    )
    def test_multi_gpu_data_parallel_forward(self):
        # Opt-out of this test.
        pass
    def test_for_sequence_classification(self):
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -232,8 +232,10 @@ class TransfoXLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestC
        return
    @require_torch_multi_gpu
    @unittest.skip(
        reason="Transfo-XL does not work with data parallel (DP) because of a bug in PyTorch: https://github.com/pytorch/pytorch/issues/36035"
    )
    def test_multi_gpu_data_parallel_forward(self):
        # Opt-out of this test.
        pass
    @slow