From ba2cf5f90d2e81c97a528b73479c9cfac730f6f7 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Thu, 8 Apr 2021 14:36:45 -0400
Subject: [PATCH 1/8] Add fairscale and deepspeed back to the CI (#11147)

* Add fairscale and deepspeed back to the CI

* Add deepspeed to single GPU tests
---
 .github/workflows/self-scheduled.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 3b72baea0d..c49a967d2a 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -34,6 +34,7 @@ jobs:
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
           pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install deepspeed
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
@@ -156,6 +157,8 @@ jobs:
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
           pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install fairscale
+          pip install deepspeed
 
       - name: Are GPUs recognized by our DL frameworks
         run: |

From 9c9b8e707b9803a1425ed8dd2f51069b22d9230f Mon Sep 17 00:00:00 2001
From: Philipp Schmid <32632186+philschmid@users.noreply.github.com>
Date: Thu, 8 Apr 2021 22:05:53 +0200
Subject: [PATCH 2/8] Updates SageMaker docs for updating DLCs (#11140)

---
 tests/sagemaker/README.md | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/sagemaker/README.md b/tests/sagemaker/README.md
index b3c9906cc5..3d8ab7c2bf 100644
--- a/tests/sagemaker/README.md
+++ b/tests/sagemaker/README.md
@@ -136,10 +136,7 @@ images:
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, 
       *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
 ```
-2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). 
-
-TODO: Add a screenshot of PR + Text template to make it easy to open. 
-
+2. In the PR comment describe what test we ran and with which framework versions. Here you can copy the table from [Current Tests](#current-tests). You can take a look at this [PR](https://github.com/aws/deep-learning-containers/pull/1016) to see which information is needed.
 
 ## Current Tests
 
@@ -150,4 +147,4 @@ TODO: Add a screenshot of PR + Text template to make it easy to open.
 | pytorch-transfromers-test-2-smd     | test bert finetuning using BERT from transformer lib+ PT SM DDP   | SageMaker createTrainingJob | 16    | train_runtime, eval_accuracy & eval_loss |
 | pytorch-transfromers-test-1-smp     | test roberta finetuning using BERT from transformer lib+ PT SM MP | SageMaker createTrainingJob | 8     | train_runtime, eval_accuracy & eval_loss |
 | tensorflow-transfromers-test-single | Test bert finetuning using BERT from transformer lib+TF           | SageMaker createTrainingJob | 1     | train_runtime, eval_accuracy & eval_loss |
-| tensorflow-transfromers-test-2-smd  | test bert finetuning using BERT from transformer lib+ TF SM DDP   | SageMaker createTrainingJob | 16    | train_runtime, eval_accuracy & eval_loss |
\ No newline at end of file
+| tensorflow-transfromers-test-2-smd  | test bert finetuning using BERT from transformer lib+ TF SM DDP   | SageMaker createTrainingJob | 16    | train_runtime, eval_accuracy & eval_loss |

From dfed4ec263db3a2eb5d018b10abfaf80e339babe Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 8 Apr 2021 16:12:36 -0400
Subject: [PATCH 3/8] Don't duplicate logs in TensorBoard and handle --use_env
 (#11141)

---
 src/transformers/integrations.py  | 8 +++++---
 src/transformers/training_args.py | 6 ++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index ffa78bf3f0..65824c25ca 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -604,9 +604,11 @@ class TensorBoardCallback(TrainerCallback):
                 self.tb_writer.add_hparams(args.to_sanitized_dict(), metric_dict={})
 
     def on_log(self, args, state, control, logs=None, **kwargs):
-        if state.is_world_process_zero:
-            if self.tb_writer is None:
-                self._init_summary_writer(args)
+        if not state.is_world_process_zero:
+            return
+
+        if self.tb_writer is None:
+            self._init_summary_writer(args)
 
         if self.tb_writer is not None:
             logs = rewrite_logs(logs)
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 9e55355290..188bf92b63 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -531,6 +531,12 @@ class TrainingArguments:
     )
 
     def __post_init__(self):
+        # Handle --use_env option in torch.distributed.launch (local_rank not passed as an arg then).
+        # This needs to happen before any call to self.device or self.n_gpu.
+        env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
+        if env_local_rank != -1 and env_local_rank != self.local_rank:
+            self.local_rank = env_local_rank
+
         # expand paths, if not os.makedirs("~/bar") will make directory
         # in the current directory instead of the actual home
         #  see https://github.com/huggingface/transformers/issues/10628

From 6c40e49712972141c3d7aeba4ed90bb79f2bb078 Mon Sep 17 00:00:00 2001
From: Andrea Cappelli <ak314@users.noreply.github.com>
Date: Thu, 8 Apr 2021 22:12:49 +0200
Subject: [PATCH 4/8] Run mlm pad to multiple for fp16 (#11128)

* Add mlm collator pad to multiple option (#10627)

* Use padding to 8x in run mlm (#10627)
---
 examples/language-modeling/run_mlm.py  |  7 +++-
 src/transformers/data/data_collator.py | 13 ++++--
 tests/test_data_collator.py            | 56 ++++++++++++++++++++++++--
 3 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py
index 4fd3c4f217..2934fb0c23 100755
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -422,7 +422,12 @@ def main():
 
     # Data collator
     # This one will take care of randomly masking the tokens.
-    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm_probability=data_args.mlm_probability,
+        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
+    )
 
     # Initialize our Trainer
     trainer = Trainer(
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 94eaade7b1..9915eb5a5f 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -192,7 +192,7 @@ class DataCollatorForTokenClassification:
         return batch
 
 
-def _collate_batch(examples, tokenizer):
+def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
     """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
     # Tensorize if necessary.
     if isinstance(examples[0], (list, tuple)):
@@ -201,7 +201,7 @@ def _collate_batch(examples, tokenizer):
     # Check if padding is necessary.
     length_of_first = examples[0].size(0)
     are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
-    if are_tensors_same_length:
+    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
         return torch.stack(examples, dim=0)
 
     # If yes, check if we have a `pad_token`.
@@ -213,6 +213,8 @@ def _collate_batch(examples, tokenizer):
 
     # Creating the full tensor and filling it with our data.
     max_length = max(x.size(0) for x in examples)
+    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
     result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
     for i, example in enumerate(examples):
         if tokenizer.padding_side == "right":
@@ -311,6 +313,8 @@ class DataCollatorForLanguageModeling:
             non-masked tokens and the value to predict for the masked token.
         mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
             The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set, will pad the sequence to a multiple of the provided value.
 
     .. note::
 
@@ -323,6 +327,7 @@ class DataCollatorForLanguageModeling:
     tokenizer: PreTrainedTokenizerBase
     mlm: bool = True
     mlm_probability: float = 0.15
+    pad_to_multiple_of: Optional[int] = None
 
     def __post_init__(self):
         if self.mlm and self.tokenizer.mask_token is None:
@@ -336,9 +341,9 @@ class DataCollatorForLanguageModeling:
     ) -> Dict[str, torch.Tensor]:
         # Handle dict or lists with proper padding and conversion to tensor.
         if isinstance(examples[0], (dict, BatchEncoding)):
-            batch = self.tokenizer.pad(examples, return_tensors="pt")
+            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
         else:
-            batch = {"input_ids": _collate_batch(examples, self.tokenizer)}
+            batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)}
 
         # If special token mask has been preprocessed, pop it from the dict.
         special_tokens_mask = batch.pop("special_tokens_mask", None)
diff --git a/tests/test_data_collator.py b/tests/test_data_collator.py
index be138314d3..e9d363229f 100644
--- a/tests/test_data_collator.py
+++ b/tests/test_data_collator.py
@@ -146,11 +146,8 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["labels"].shape, torch.Size([2, 6]))
         self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)
 
-    def test_data_collator_for_language_modeling(self):
+    def _test_no_pad_and_pad(self, no_pad_features, pad_features):
         tokenizer = BertTokenizer(self.vocab_file)
-        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
-        pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
-
         data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
         batch = data_collator(no_pad_features)
         self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
@@ -160,6 +157,15 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
         self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
 
+        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
+        batch = data_collator(no_pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        batch = data_collator(pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
         tokenizer._pad_token = None
         data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
         with self.assertRaises(ValueError):
@@ -185,6 +191,32 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertTrue(torch.any(masked_tokens))
         self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
 
+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(no_pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
+        self.assertTrue(torch.any(masked_tokens))
+        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
+
+        batch = data_collator(pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
+        self.assertTrue(torch.any(masked_tokens))
+        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
+
+    def test_data_collator_for_language_modeling(self):
+        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
+        pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
+        self._test_no_pad_and_pad(no_pad_features, pad_features)
+
+        no_pad_features = [list(range(10)), list(range(10))]
+        pad_features = [list(range(5)), list(range(10))]
+        self._test_no_pad_and_pad(no_pad_features, pad_features)
+
     def test_plm(self):
         tokenizer = BertTokenizer(self.vocab_file)
         no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
@@ -225,6 +257,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
         self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
 
+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(features)
+
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
+
     def test_sop(self):
         tokenizer = BertTokenizer(self.vocab_file)
         features = [
@@ -242,3 +282,11 @@ class DataCollatorIntegrationTest(unittest.TestCase):
         self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
         self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
         self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
+
+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(features)
+
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))

From 66446909b236c17498276857fa88e23d2c91d004 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Thu, 8 Apr 2021 13:13:17 -0700
Subject: [PATCH 5/8] [tests] relocate core integration tests (#11146)

* relocate core integration tests

* add sys.path context manager

* cleanup

* try

* try2

* fix path

* doc

* style

* add dep

* add 2 more deps
---
 docs/source/main_classes/trainer.rst          |  4 ++--
 docs/source/testing.rst                       | 23 ++++++++++++++++---
 setup.py                                      | 13 +++++++----
 src/transformers/dependency_versions_table.py |  9 +++++---
 src/transformers/testing_utils.py             | 22 ++++++++++++++++++
 .../deepspeed/ds_config_zero2.json            |  0
 .../deepspeed/ds_config_zero3.json            |  0
 .../deepspeed/test_deepspeed.py               | 11 +++++----
 .../extended}/test_trainer_ext.py             |  5 ++--
 9 files changed, 68 insertions(+), 19 deletions(-)
 rename {examples/tests => tests}/deepspeed/ds_config_zero2.json (100%)
 rename {examples/tests => tests}/deepspeed/ds_config_zero3.json (100%)
 rename {examples/tests => tests}/deepspeed/test_deepspeed.py (98%)
 rename {examples/tests/trainer => tests/extended}/test_trainer_ext.py (98%)

diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index 2e323aaa28..bc9f248827 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -525,7 +525,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a
 .. code-block:: bash
 
     deepspeed examples/seq2seq/run_translation.py \
-    --deepspeed examples/tests/deepspeed/ds_config.json \
+    --deepspeed tests/deepspeed/ds_config.json \
     --model_name_or_path t5-small --per_device_train_batch_size 1   \
     --output_dir output_dir --overwrite_output_dir --fp16 \
     --do_train --max_train_samples 500 --num_train_epochs 1 \
@@ -550,7 +550,7 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma
 .. code-block:: bash
 
     deepspeed --num_gpus=1 examples/seq2seq/run_translation.py \
-    --deepspeed examples/tests/deepspeed/ds_config.json \
+    --deepspeed tests/deepspeed/ds_config.json \
     --model_name_or_path t5-small --per_device_train_batch_size 1   \
     --output_dir output_dir --overwrite_output_dir --fp16 \
     --do_train --max_train_samples 500 --num_train_epochs 1 \
diff --git a/docs/source/testing.rst b/docs/source/testing.rst
index 10ad3e2311..9a4efb06fc 100644
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1,4 +1,4 @@
-.. 
+..
     Copyright 2020 The HuggingFace Team. All rights reserved.
 
     Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -388,7 +388,7 @@ For a single or a group of tests via ``pytest`` (after ``pip install pytest-pspe
 
 .. code-block:: bash
 
-   pytest --pspec tests/test_optimization.py 
+   pytest --pspec tests/test_optimization.py
 
 
 
@@ -672,7 +672,7 @@ and it will list:
 
     test_this2.py::test_floor[integer-1-1.0]
     test_this2.py::test_floor[negative--1.5--2.0]
-    test_this2.py::test_floor[large fraction-1.6-1]       
+    test_this2.py::test_floor[large fraction-1.6-1]
 
 So now you can run just the specific test:
 
@@ -795,6 +795,23 @@ leave any data in there.
    otherwise.
 
 
+Temporary sys.path override
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to temporarily override ``sys.path``, for example to import from another test, you can use the
+``ExtendSysPath`` context manager. Example:
+
+
+.. code-block:: python
+
+    import os
+    from transformers.testing_utils import ExtendSysPath
+    bindir = os.path.abspath(os.path.dirname(__file__))
+    with ExtendSysPath(f"{bindir}/.."):
+        from test_trainer import TrainerIntegrationCommon  # noqa
+
+
+
 Skipping tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/setup.py b/setup.py
index 45df48f68b..c3583a3070 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ To create the package for pypi.
 
 1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
    documentation.
-   
+
 2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.
 
 3. Unpin specific versions from setup.py that use a git install.
@@ -85,6 +85,7 @@ if stale_egg_info.exists():
 # 1. all dependencies should be listed here with their version requirements if any
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
+    "Pillow",
     "black>=20.8b1",
     "cookiecutter==1.7.2",
     "dataclasses",
@@ -102,13 +103,13 @@ _deps = [
     "jax>=0.2.8",
     "jaxlib>=0.1.59",
     "keras2onnx",
+    "nltk",
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
     "onnxruntime>=1.4.0",
     "packaging",
     "parameterized",
-    "Pillow",
     "protobuf",
     "psutil",
     "pydantic",
@@ -119,15 +120,18 @@ _deps = [
     "recommonmark",
     "regex!=2019.12.17",
     "requests",
+    "rouge-score",
+    "sacrebleu>=1.4.12",
     "sacremoses",
+    "sagemaker>=2.31.0",
     "scikit-learn",
     "sentencepiece==0.1.91",
     "soundfile",
     "sphinx-copybutton",
     "sphinx-markdown-tables",
     "sphinx-rtd-theme==0.4.3",  # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
-    "sphinxext-opengraph==0.4.1",
     "sphinx==3.2.1",
+    "sphinxext-opengraph==0.4.1",
     "starlette",
     "tensorflow-cpu>=2.3",
     "tensorflow>=2.3",
@@ -139,7 +143,6 @@ _deps = [
     "unidic>=1.0.2",
     "unidic_lite>=1.0.7",
     "uvicorn",
-    "sagemaker>=2.31.0",
 ]
 
 
@@ -238,7 +241,7 @@ extras["vision"] = deps_list("Pillow")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
     deps_list(
-        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black"
+        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black", "sacrebleu", "rouge-score", "nltk"
     )
     + extras["retrieval"]
     + extras["modelcreation"]
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index b53407ad3e..43f4c028fe 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -2,6 +2,7 @@
 # 1. modify the `_deps` dict in setup.py
 # 2. run `make deps_table_update``
 deps = {
+    "Pillow": "Pillow",
     "black": "black>=20.8b1",
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
@@ -19,13 +20,13 @@ deps = {
     "jax": "jax>=0.2.8",
     "jaxlib": "jaxlib>=0.1.59",
     "keras2onnx": "keras2onnx",
+    "nltk": "nltk",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
     "onnxruntime": "onnxruntime>=1.4.0",
     "packaging": "packaging",
     "parameterized": "parameterized",
-    "Pillow": "Pillow",
     "protobuf": "protobuf",
     "psutil": "psutil",
     "pydantic": "pydantic",
@@ -36,15 +37,18 @@ deps = {
     "recommonmark": "recommonmark",
     "regex": "regex!=2019.12.17",
     "requests": "requests",
+    "rouge-score": "rouge-score",
+    "sacrebleu": "sacrebleu>=1.4.12",
     "sacremoses": "sacremoses",
+    "sagemaker": "sagemaker>=2.31.0",
     "scikit-learn": "scikit-learn",
     "sentencepiece": "sentencepiece==0.1.91",
     "soundfile": "soundfile",
     "sphinx-copybutton": "sphinx-copybutton",
     "sphinx-markdown-tables": "sphinx-markdown-tables",
     "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3",
-    "sphinxext-opengraph": "sphinxext-opengraph==0.4.1",
     "sphinx": "sphinx==3.2.1",
+    "sphinxext-opengraph": "sphinxext-opengraph==0.4.1",
     "starlette": "starlette",
     "tensorflow-cpu": "tensorflow-cpu>=2.3",
     "tensorflow": "tensorflow>=2.3",
@@ -56,5 +60,4 @@ deps = {
     "unidic": "unidic>=1.0.2",
     "unidic_lite": "unidic_lite>=1.0.7",
     "uvicorn": "uvicorn",
-    "sagemaker": "sagemaker>=2.31.0",
 }
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 3f1273a7c9..a5c4e7d2b8 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -24,6 +24,7 @@ import unittest
 from distutils.util import strtobool
 from io import StringIO
 from pathlib import Path
+from typing import Iterator, Union
 
 from .file_utils import (
     is_datasets_available,
@@ -621,6 +622,27 @@ class CaptureLogger:
         return f"captured: {self.out}\n"
 
 
+@contextlib.contextmanager
+# adapted from https://stackoverflow.com/a/64789046/9201239
+def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
+    """
+    Temporarily add the given path to `sys.path`.
+
+    Usage ::
+
+       with ExtendSysPath('/path/to/dir'):
+           mymodule = importlib.import_module('mymodule')
+
+    """
+
+    path = os.fspath(path)
+    try:
+        sys.path.insert(0, path)
+        yield
+    finally:
+        sys.path.remove(path)
+
+
 class TestCasePlus(unittest.TestCase):
     """
     This class extends `unittest.TestCase` with additional features.
diff --git a/examples/tests/deepspeed/ds_config_zero2.json b/tests/deepspeed/ds_config_zero2.json
similarity index 100%
rename from examples/tests/deepspeed/ds_config_zero2.json
rename to tests/deepspeed/ds_config_zero2.json
diff --git a/examples/tests/deepspeed/ds_config_zero3.json b/tests/deepspeed/ds_config_zero3.json
similarity index 100%
rename from examples/tests/deepspeed/ds_config_zero3.json
rename to tests/deepspeed/ds_config_zero3.json
diff --git a/examples/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
similarity index 98%
rename from examples/tests/deepspeed/test_deepspeed.py
rename to tests/deepspeed/test_deepspeed.py
index b9c9b46167..9baaf3085b 100644
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -16,16 +16,16 @@ import dataclasses
 import io
 import json
 import os
-import sys
 import unittest
 from copy import deepcopy
 
 from parameterized import parameterized
-from transformers import TrainingArguments
+from transformers import TrainingArguments, is_torch_available
 from transformers.file_utils import WEIGHTS_NAME
 from transformers.integrations import is_deepspeed_available
 from transformers.testing_utils import (
     CaptureLogger,
+    ExtendSysPath,
     TestCasePlus,
     execute_subprocess_async,
     get_gpu_count,
@@ -38,8 +38,11 @@ from transformers.trainer_utils import set_seed
 
 
 bindir = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(f"{bindir}/../../../tests")
-from test_trainer import TrainerIntegrationCommon, get_regression_trainer  # noqa
+with ExtendSysPath(f"{bindir}/.."):
+    from test_trainer import TrainerIntegrationCommon  # noqa
+
+    if is_torch_available():
+        from test_trainer import get_regression_trainer  # noqa
 
 
 set_seed(42)
diff --git a/examples/tests/trainer/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
similarity index 98%
rename from examples/tests/trainer/test_trainer_ext.py
rename to tests/extended/test_trainer_ext.py
index 82ec2f625c..6d13f9a4cc 100644
--- a/examples/tests/trainer/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -21,6 +21,7 @@ from unittest.mock import patch
 from transformers.file_utils import is_apex_available
 from transformers.integrations import is_fairscale_available
 from transformers.testing_utils import (
+    ExtendSysPath,
     TestCasePlus,
     execute_subprocess_async,
     get_gpu_count,
@@ -34,8 +35,8 @@ from transformers.trainer_utils import set_seed
 
 
 bindir = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(f"{bindir}/../../seq2seq")
-from run_translation import main  # noqa
+with ExtendSysPath(f"{bindir}/../../examples/seq2seq"):
+    from run_translation import main  # noqa
 
 
 set_seed(42)

From 97ccf67bb3c8bc309e8d2302874f56c734628a1c Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Thu, 8 Apr 2021 15:10:44 -0700
Subject: [PATCH 6/8] [setup] extras[docs] must include 'all' (#11148)

* extras[doc] must include 'all'

* fix

* better

* regroup
---
 .circleci/config.yml |  4 ++--
 setup.py             | 24 ++++++++++++++----------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 999af392fb..ec9c5741fb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -348,7 +348,7 @@ jobs:
                       - v0.4-{{ checksum "setup.py" }}
             - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
             - run: pip install --upgrade pip
-            - run: pip install ."[all, docs]"
+            - run: pip install ."[docs]"
             - save_cache:
                   key: v0.4-build_doc-{{ checksum "setup.py" }}
                   paths:
@@ -370,7 +370,7 @@ jobs:
                   keys:
                       - v0.4-deploy_doc-{{ checksum "setup.py" }}
                       - v0.4-{{ checksum "setup.py" }}
-            - run: pip install ."[all,docs]"
+            - run: pip install ."[docs]"
             - save_cache:
                   key: v0.4-deploy_doc-{{ checksum "setup.py" }}
                   paths:
diff --git a/setup.py b/setup.py
index c3583a3070..e942e65a7c 100644
--- a/setup.py
+++ b/setup.py
@@ -246,15 +246,7 @@ extras["testing"] = (
     + extras["retrieval"]
     + extras["modelcreation"]
 )
-extras["docs"] = deps_list(
-    "docutils",
-    "recommonmark",
-    "sphinx",
-    "sphinx-markdown-tables",
-    "sphinx-rtd-theme",
-    "sphinx-copybutton",
-    "sphinxext-opengraph",
-)
+
 extras["quality"] = deps_list("black", "isort", "flake8")
 
 extras["all"] = (
@@ -267,12 +259,24 @@ extras["all"] = (
     + extras["vision"]
 )
 
+extras["docs_specific"] = deps_list(
+    "docutils",
+    "recommonmark",
+    "sphinx",
+    "sphinx-markdown-tables",
+    "sphinx-rtd-theme",
+    "sphinx-copybutton",
+    "sphinxext-opengraph",
+)
+# "docs" needs "all" to resolve all the references
+extras["docs"] = extras["all"] + extras["docs_specific"]
+
 extras["dev"] = (
     extras["all"]
     + extras["testing"]
     + extras["quality"]
     + extras["ja"]
-    + extras["docs"]
+    + extras["docs_specific"]
     + extras["sklearn"]
     + extras["modelcreation"]
 )

From ba8b1f4754257e140ddabbe04a7f3e493e33802d Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Thu, 8 Apr 2021 18:41:36 -0400
Subject: [PATCH 7/8] Add support for multiple models for one config in auto
 classes (#11150)

* Add support for multiple models for one config in auto classes

* Use get_values everywhere

* Prettier doc
---
 src/transformers/modeling_flax_utils.py       |  1 +
 src/transformers/modeling_tf_utils.py         |  1 +
 src/transformers/models/auto/__init__.py      |  2 +
 src/transformers/models/auto/auto_factory.py  | 39 +++++++++++++++++--
 .../models/auto/configuration_auto.py         | 19 ++++++---
 src/transformers/models/auto/modeling_auto.py |  3 +-
 .../models/auto/modeling_tf_auto.py           |  3 +-
 tests/test_modeling_albert.py                 |  3 +-
 tests/test_modeling_auto.py                   | 32 +++++++++++++--
 tests/test_modeling_bert.py                   |  3 +-
 tests/test_modeling_big_bird.py               |  3 +-
 tests/test_modeling_common.py                 | 27 ++++++-------
 tests/test_modeling_convbert.py               |  3 +-
 tests/test_modeling_electra.py                |  3 +-
 tests/test_modeling_flax_bert.py              |  2 +
 tests/test_modeling_funnel.py                 |  3 +-
 tests/test_modeling_led.py                    |  3 +-
 tests/test_modeling_lxmert.py                 |  5 ++-
 tests/test_modeling_megatron_bert.py          |  3 +-
 tests/test_modeling_mobilebert.py             |  3 +-
 tests/test_modeling_tapas.py                  | 19 ++++-----
 tests/test_modeling_tf_albert.py              |  3 +-
 tests/test_modeling_tf_auto.py                | 30 +++++++++++++-
 tests/test_modeling_tf_bert.py                |  3 +-
 tests/test_modeling_tf_common.py              | 31 ++++++++-------
 utils/check_repo.py                           | 13 ++++---
 26 files changed, 188 insertions(+), 72 deletions(-)

diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index c425f1a000..b9464ad3e5 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -387,6 +387,7 @@ class FlaxPreTrainedModel(ABC):
         # get abs dir
         save_directory = os.path.abspath(save_directory)
         # save config as well
+        self.config.architectures = [self.__class__.__name__[4:]]
         self.config.save_pretrained(save_directory)
 
         # save model
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 3eec82e0db..002a7667f2 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -1037,6 +1037,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
             logger.info(f"Saved model created in {saved_model_dir}")
 
         # Save configuration file
+        self.config.architectures = [self.__class__.__name__[2:]]
         self.config.save_pretrained(save_directory)
 
         # If we save using the predefined names, we can load using `from_pretrained`
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index ef255d8b26..4abf6da50d 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -22,6 +22,7 @@ from ...file_utils import _BaseLazyModule, is_flax_available, is_tf_available, i
 
 
 _import_structure = {
+    "auto_factory": ["get_values"],
     "configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"],
     "feature_extraction_auto": ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"],
     "tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"],
@@ -104,6 +105,7 @@ if is_flax_available():
 
 
 if TYPE_CHECKING:
+    from .auto_factory import get_values
     from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig
     from .feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
     from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index 1c96f13199..4ec9b6c31c 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -328,6 +328,33 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
 """
 
 
+def _get_model_class(config, model_mapping):
+    """
+    Pick the model class to use for ``config`` from ``model_mapping``.
+
+    When the mapping associates several model classes with the config class, the ``architectures`` attribute of the
+    config selects one of them by name; the first class listed is the default.
+    """
+    supported_models = model_mapping[type(config)]
+    if not isinstance(supported_models, (list, tuple)):
+        return supported_models
+
+    name_to_model = {model.__name__: model for model in supported_models}
+    # Guard against ``architectures`` being present but ``None``: treat it the same as an empty list.
+    architectures = getattr(config, "architectures", None) or []
+    for arch in architectures:
+        if arch in name_to_model:
+            return name_to_model[arch]
+        elif f"TF{arch}" in name_to_model:
+            return name_to_model[f"TF{arch}"]
+        elif f"Flax{arch}" in name_to_model:
+            return name_to_model[f"Flax{arch}"]
+
+    # If no architecture is set in the config, or none matches the supported models, the first element of the tuple
+    # is the default.
+    return supported_models[0]
+
+
 class _BaseAutoModelClass:
     # Base class for auto models.
     _model_mapping = None
@@ -341,7 +368,8 @@ class _BaseAutoModelClass:
 
     def from_config(cls, config, **kwargs):
         if type(config) in cls._model_mapping.keys():
-            return cls._model_mapping[type(config)](config, **kwargs)
+            model_class = _get_model_class(config, cls._model_mapping)
+            return model_class(config, **kwargs)
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
             f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
@@ -356,9 +384,8 @@ class _BaseAutoModelClass:
             )
 
         if type(config) in cls._model_mapping.keys():
-            return cls._model_mapping[type(config)].from_pretrained(
-                pretrained_model_name_or_path, *model_args, config=config, **kwargs
-            )
+            model_class = _get_model_class(config, cls._model_mapping)
+            return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
             f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
@@ -418,3 +445,17 @@ def auto_class_factory(name, model_mapping, checkpoint_for_example="bert-base-ca
     from_pretrained = replace_list_option_in_docstrings(model_mapping)(from_pretrained)
     new_class.from_pretrained = classmethod(from_pretrained)
     return new_class
+
+
+def get_values(model_mapping):
+    """
+    Return a flat list of all the model classes in ``model_mapping``, expanding values that are lists or tuples.
+    """
+    result = []
+    for model in model_mapping.values():
+        if isinstance(model, (list, tuple)):
+            result += list(model)
+        else:
+            result.append(model)
+
+    return result
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 2bb4586349..aa095c4e6a 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -247,29 +247,38 @@ MODEL_NAMES_MAPPING = OrderedDict(
 )
 
 
+def _get_class_name(model_class):
+    if isinstance(model_class, (list, tuple)):
+        return " or ".join([f":class:`~transformers.{c.__name__}`" for c in model_class])
+    return f":class:`~transformers.{model_class.__name__}`"
+
+
 def _list_model_options(indent, config_to_class=None, use_model_types=True):
     if config_to_class is None and not use_model_types:
         raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.")
     if use_model_types:
         if config_to_class is None:
-            model_type_to_name = {model_type: config.__name__ for model_type, config in CONFIG_MAPPING.items()}
+            model_type_to_name = {
+                model_type: f":class:`~transformers.{config.__name__}`"
+                for model_type, config in CONFIG_MAPPING.items()
+            }
         else:
             model_type_to_name = {
-                model_type: config_to_class[config].__name__
+                model_type: _get_class_name(config_to_class[config])
                 for model_type, config in CONFIG_MAPPING.items()
                 if config in config_to_class
             }
         lines = [
-            f"{indent}- **{model_type}** -- :class:`~transformers.{model_type_to_name[model_type]}` ({MODEL_NAMES_MAPPING[model_type]} model)"
+            f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)"
             for model_type in sorted(model_type_to_name.keys())
         ]
     else:
-        config_to_name = {config.__name__: clas.__name__ for config, clas in config_to_class.items()}
+        config_to_name = {config.__name__: _get_class_name(clas) for config, clas in config_to_class.items()}
         config_to_model_name = {
             config.__name__: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING.items()
         }
         lines = [
-            f"{indent}- :class:`~transformers.{config_name}` configuration class: :class:`~transformers.{config_to_name[config_name]}` ({config_to_model_name[config_name]} model)"
+            f"{indent}- :class:`~transformers.{config_name}` configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)"
             for config_name in sorted(config_to_name.keys())
         ]
     return "\n".join(lines)
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 64ff826a8e..cf01739296 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -124,6 +124,7 @@ from ..flaubert.modeling_flaubert import (
 )
 from ..fsmt.modeling_fsmt import FSMTForConditionalGeneration, FSMTModel
 from ..funnel.modeling_funnel import (
+    FunnelBaseModel,
     FunnelForMaskedLM,
     FunnelForMultipleChoice,
     FunnelForPreTraining,
@@ -377,7 +378,7 @@ MODEL_MAPPING = OrderedDict(
         (CTRLConfig, CTRLModel),
         (ElectraConfig, ElectraModel),
         (ReformerConfig, ReformerModel),
-        (FunnelConfig, FunnelModel),
+        (FunnelConfig, (FunnelModel, FunnelBaseModel)),
         (LxmertConfig, LxmertModel),
         (BertGenerationConfig, BertGenerationEncoder),
         (DebertaConfig, DebertaModel),
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index 0abb08c890..2104bb6442 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -91,6 +91,7 @@ from ..flaubert.modeling_tf_flaubert import (
     TFFlaubertWithLMHeadModel,
 )
 from ..funnel.modeling_tf_funnel import (
+    TFFunnelBaseModel,
     TFFunnelForMaskedLM,
     TFFunnelForMultipleChoice,
     TFFunnelForPreTraining,
@@ -242,7 +243,7 @@ TF_MODEL_MAPPING = OrderedDict(
         (XLMConfig, TFXLMModel),
         (CTRLConfig, TFCTRLModel),
         (ElectraConfig, TFElectraModel),
-        (FunnelConfig, TFFunnelModel),
+        (FunnelConfig, (TFFunnelModel, TFFunnelBaseModel)),
         (DPRConfig, TFDPRQuestionEncoder),
         (MPNetConfig, TFMPNetModel),
         (BartConfig, TFBartModel),
diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py
index 1859f51aa5..7f82c67ba0 100644
--- a/tests/test_modeling_albert.py
+++ b/tests/test_modeling_albert.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -234,7 +235,7 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_auto.py b/tests/test_modeling_auto.py
index d395d9640d..0ba839c42a 100644
--- a/tests/test_modeling_auto.py
+++ b/tests/test_modeling_auto.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import copy
+import tempfile
 import unittest
 
 from transformers import is_torch_available
@@ -46,6 +47,8 @@ if is_torch_available():
         BertForSequenceClassification,
         BertForTokenClassification,
         BertModel,
+        FunnelBaseModel,
+        FunnelModel,
         GPT2Config,
         GPT2LMHeadModel,
         RobertaForMaskedLM,
@@ -218,6 +221,21 @@ class AutoModelTest(unittest.TestCase):
         self.assertEqual(model.num_parameters(), 14410)
         self.assertEqual(model.num_parameters(only_trainable=True), 14410)
 
+    def test_from_pretrained_with_tuple_values(self):
+        # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
+        model = AutoModel.from_pretrained("sgugger/funnel-random-tiny")
+        self.assertIsInstance(model, FunnelModel)
+
+        config = copy.deepcopy(model.config)
+        config.architectures = ["FunnelBaseModel"]
+        model = AutoModel.from_config(config)
+        self.assertIsInstance(model, FunnelBaseModel)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            model = AutoModel.from_pretrained(tmp_dir)
+            self.assertIsInstance(model, FunnelBaseModel)
+
     def test_parents_and_children_in_mappings(self):
         # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
         # by the parents and will return the wrong configuration type when using auto models
@@ -242,6 +260,12 @@ class AutoModelTest(unittest.TestCase):
                     assert not issubclass(
                         child_config, parent_config
                     ), f"{child_config.__name__} is child of {parent_config.__name__}"
-                    assert not issubclass(
-                        child_model, parent_model
-                    ), f"{child_config.__name__} is child of {parent_config.__name__}"
+
+                    # Tuplify child_model and parent_model since some of them could be tuples.
+                    if not isinstance(child_model, (list, tuple)):
+                        child_model = (child_model,)
+                    if not isinstance(parent_model, (list, tuple)):
+                        parent_model = (parent_model,)
+
+                    for child, parent in [(a, b) for a in child_model for b in parent_model]:
+                        assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py
index 03f76c264b..97da4350ab 100755
--- a/tests/test_modeling_bert.py
+++ b/tests/test_modeling_bert.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -444,7 +445,7 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_big_bird.py b/tests/test_modeling_big_bird.py
index 9a6a55108e..edef01f207 100644
--- a/tests/test_modeling_big_bird.py
+++ b/tests/test_modeling_big_bird.py
@@ -19,6 +19,7 @@ import unittest
 
 from tests.test_modeling_common import floats_tensor
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer
 from transformers.testing_utils import require_torch, slow, torch_device
 
@@ -458,7 +459,7 @@ class BigBirdModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 9ce171e649..d5d76162bc 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -24,6 +24,7 @@ from typing import List, Tuple
 
 from transformers import is_torch_available
 from transformers.file_utils import WEIGHTS_NAME
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device
 
 
@@ -79,7 +80,7 @@ class ModelTesterMixin:
 
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
-        if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
                 k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
                 if isinstance(v, torch.Tensor) and v.ndim > 1
@@ -88,9 +89,9 @@ class ModelTesterMixin:
             }
 
         if return_labels:
-            if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                 inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
-            elif model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            elif model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                 inputs_dict["start_positions"] = torch.zeros(
                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
@@ -98,18 +99,18 @@ class ModelTesterMixin:
                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
             elif model_class in [
-                *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
-                *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
-                *MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.values(),
+                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
+                *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING),
             ]:
                 inputs_dict["labels"] = torch.zeros(
                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
             elif model_class in [
-                *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
-                *MODEL_FOR_CAUSAL_LM_MAPPING.values(),
-                *MODEL_FOR_MASKED_LM_MAPPING.values(),
-                *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
+                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
             ]:
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
@@ -229,7 +230,7 @@ class ModelTesterMixin:
         config.return_dict = True
 
         for model_class in self.all_model_classes:
-            if model_class in MODEL_MAPPING.values():
+            if model_class in get_values(MODEL_MAPPING):
                 continue
             model = model_class(config)
             model.to(torch_device)
@@ -248,7 +249,7 @@ class ModelTesterMixin:
         config.return_dict = True
 
         for model_class in self.all_model_classes:
-            if model_class in MODEL_MAPPING.values():
+            if model_class in get_values(MODEL_MAPPING):
                 continue
             model = model_class(config)
             model.to(torch_device)
@@ -312,7 +313,7 @@ class ModelTesterMixin:
                 if "labels" in inputs_dict:
                     correct_outlen += 1  # loss is added to beginning
                 # Question Answering model returns start_logits and end_logits
-                if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+                if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                     correct_outlen += 1  # start_logits and end_logits instead of only 1 output
                 if "past_key_values" in outputs:
                     correct_outlen += 1  # past_key_values have been returned
diff --git a/tests/test_modeling_convbert.py b/tests/test_modeling_convbert.py
index 610affc451..062a7f506a 100644
--- a/tests/test_modeling_convbert.py
+++ b/tests/test_modeling_convbert.py
@@ -19,6 +19,7 @@ import unittest
 
 from tests.test_modeling_common import floats_tensor
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -352,7 +353,7 @@ class ConvBertModelTest(ModelTesterMixin, unittest.TestCase):
                 if "labels" in inputs_dict:
                     correct_outlen += 1  # loss is added to beginning
                 # Question Answering model returns start_logits and end_logits
-                if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+                if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                     correct_outlen += 1  # start_logits and end_logits instead of only 1 output
                 if "past_key_values" in outputs:
                     correct_outlen += 1  # past_key_values have been returned
diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py
index 88138a587c..5935eafee6 100644
--- a/tests/test_modeling_electra.py
+++ b/tests/test_modeling_electra.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -292,7 +293,7 @@ class ElectraModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_flax_bert.py b/tests/test_modeling_flax_bert.py
index fc339f7501..273f55d157 100644
--- a/tests/test_modeling_flax_bert.py
+++ b/tests/test_modeling_flax_bert.py
@@ -29,6 +29,7 @@ if is_flax_available():
         FlaxBertForNextSentencePrediction,
         FlaxBertForPreTraining,
         FlaxBertForQuestionAnswering,
+        FlaxBertForSequenceClassification,
         FlaxBertForTokenClassification,
         FlaxBertModel,
     )
@@ -125,6 +126,7 @@ class FlaxBertModelTest(FlaxModelTesterMixin, unittest.TestCase):
             FlaxBertForMultipleChoice,
             FlaxBertForQuestionAnswering,
             FlaxBertForNextSentencePrediction,
+            FlaxBertForSequenceClassification,
             FlaxBertForTokenClassification,
             FlaxBertForQuestionAnswering,
         )
diff --git a/tests/test_modeling_funnel.py b/tests/test_modeling_funnel.py
index 0e3846cef1..4435359eb6 100644
--- a/tests/test_modeling_funnel.py
+++ b/tests/test_modeling_funnel.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import FunnelTokenizer, is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -365,7 +366,7 @@ class FunnelModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_led.py b/tests/test_modeling_led.py
index 4166060145..caffe199bb 100644
--- a/tests/test_modeling_led.py
+++ b/tests/test_modeling_led.py
@@ -21,6 +21,7 @@ import unittest
 
 from transformers import is_torch_available
 from transformers.file_utils import cached_property
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -412,7 +413,7 @@ class LEDModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
             if "labels" in inputs_dict:
                 correct_outlen += 1  # loss is added to beginning
             # Question Answering model returns start_logits and end_logits
-            if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                 correct_outlen += 1  # start_logits and end_logits instead of only 1 output
             if "past_key_values" in outputs:
                 correct_outlen += 1  # past_key_values have been returned
diff --git a/tests/test_modeling_lxmert.py b/tests/test_modeling_lxmert.py
index f05b3c3ee8..b03cc31335 100644
--- a/tests/test_modeling_lxmert.py
+++ b/tests/test_modeling_lxmert.py
@@ -18,6 +18,7 @@ import copy
 import unittest
 
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -532,11 +533,11 @@ class LxmertModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = copy.deepcopy(inputs_dict)
 
         if return_labels:
-            if model_class in MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
-            elif model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            elif model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 # special case for models like BERT that use multi-loss training for PreTraining
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
diff --git a/tests/test_modeling_megatron_bert.py b/tests/test_modeling_megatron_bert.py
index 3423f2d6f1..5be4716d33 100644
--- a/tests/test_modeling_megatron_bert.py
+++ b/tests/test_modeling_megatron_bert.py
@@ -21,6 +21,7 @@ import os
 import unittest
 
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -290,7 +291,7 @@ class MegatronBertModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py
index 9a0fc9ae96..96c974e2ed 100644
--- a/tests/test_modeling_mobilebert.py
+++ b/tests/test_modeling_mobilebert.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import is_torch_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -272,7 +273,7 @@ class MobileBertModelTest(ModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
diff --git a/tests/test_modeling_tapas.py b/tests/test_modeling_tapas.py
index b4f8f13231..b36147d558 100644
--- a/tests/test_modeling_tapas.py
+++ b/tests/test_modeling_tapas.py
@@ -32,6 +32,7 @@ from transformers import (
     is_torch_available,
 )
 from transformers.file_utils import cached_property
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_scatter, require_torch, slow, torch_device
 
 from .test_configuration_common import ConfigTester
@@ -425,7 +426,7 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
 
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
-        if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+        if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
                 k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous()
                 if isinstance(v, torch.Tensor) and v.ndim > 1
@@ -434,9 +435,9 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
             }
 
         if return_labels:
-            if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+            if model_class in get_values(MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                 inputs_dict["labels"] = torch.ones(self.model_tester.batch_size, dtype=torch.long, device=torch_device)
-            elif model_class in MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING.values():
+            elif model_class in get_values(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING):
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
                 )
@@ -457,17 +458,17 @@ class TapasModelTest(ModelTesterMixin, unittest.TestCase):
                     self.model_tester.batch_size, dtype=torch.float, device=torch_device
                 )
             elif model_class in [
-                *MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values(),
-                *MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values(),
+                *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING),
             ]:
                 inputs_dict["labels"] = torch.zeros(
                     self.model_tester.batch_size, dtype=torch.long, device=torch_device
                 )
             elif model_class in [
-                *MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
-                *MODEL_FOR_CAUSAL_LM_MAPPING.values(),
-                *MODEL_FOR_MASKED_LM_MAPPING.values(),
-                *MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
+                *get_values(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
             ]:
                 inputs_dict["labels"] = torch.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py
index aabd185f78..ab6b32ab84 100644
--- a/tests/test_modeling_tf_albert.py
+++ b/tests/test_modeling_tf_albert.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import AlbertConfig, is_tf_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_tf, slow
 
 from .test_configuration_common import ConfigTester
@@ -249,7 +250,7 @@ class TFAlbertModelTest(TFModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["sentence_order_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
 
         return inputs_dict
diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py
index ff80adc369..eb0b05f2c7 100644
--- a/tests/test_modeling_tf_auto.py
+++ b/tests/test_modeling_tf_auto.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import copy
+import tempfile
 import unittest
 
 from transformers import is_tf_available
@@ -39,6 +40,8 @@ if is_tf_available():
         TFBertForQuestionAnswering,
         TFBertForSequenceClassification,
         TFBertModel,
+        TFFunnelBaseModel,
+        TFFunnelModel,
         TFGPT2LMHeadModel,
         TFRobertaForMaskedLM,
         TFT5ForConditionalGeneration,
@@ -176,6 +179,21 @@ class TFAutoModelTest(unittest.TestCase):
         self.assertEqual(model.num_parameters(), 14410)
         self.assertEqual(model.num_parameters(only_trainable=True), 14410)
 
+    def test_from_pretrained_with_tuple_values(self):
+        # For the auto model mapping, FunnelConfig has two models: FunnelModel and FunnelBaseModel
+        model = TFAutoModel.from_pretrained("sgugger/funnel-random-tiny")
+        self.assertIsInstance(model, TFFunnelModel)
+
+        config = copy.deepcopy(model.config)
+        config.architectures = ["FunnelBaseModel"]
+        model = TFAutoModel.from_config(config)
+        self.assertIsInstance(model, TFFunnelBaseModel)
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            model = TFAutoModel.from_pretrained(tmp_dir)
+            self.assertIsInstance(model, TFFunnelBaseModel)
+
     def test_parents_and_children_in_mappings(self):
         # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered
         # by the parents and will return the wrong configuration type when using auto models
@@ -197,4 +215,12 @@ class TFAutoModelTest(unittest.TestCase):
                 for parent_config, parent_model in mapping[: index + 1]:
                     with self.subTest(msg=f"Testing if {child_config.__name__} is child of {parent_config.__name__}"):
                         self.assertFalse(issubclass(child_config, parent_config))
-                        self.assertFalse(issubclass(child_model, parent_model))
+
+                    # Normalize both to tuples, since a mapping value may be a class or a list/tuple of classes.
+                    if not isinstance(child_model, (list, tuple)):
+                        child_model = (child_model,)
+                    if not isinstance(parent_model, (list, tuple)):
+                        parent_model = (parent_model,)
+
+                    for child, parent in [(a, b) for a in child_model for b in parent_model]:
+                        assert not issubclass(child, parent), f"{child.__name__} is child of {parent.__name__}"
diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py
index 8817ae2bc1..639ba0be9d 100644
--- a/tests/test_modeling_tf_bert.py
+++ b/tests/test_modeling_tf_bert.py
@@ -17,6 +17,7 @@
 import unittest
 
 from transformers import BertConfig, is_tf_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import require_tf, slow
 
 from .test_configuration_common import ConfigTester
@@ -282,7 +283,7 @@ class TFBertModelTest(TFModelTesterMixin, unittest.TestCase):
         inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
 
         if return_labels:
-            if model_class in TF_MODEL_FOR_PRETRAINING_MAPPING.values():
+            if model_class in get_values(TF_MODEL_FOR_PRETRAINING_MAPPING):
                 inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
 
         return inputs_dict
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index a2f7085660..51daf3779d 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -25,6 +25,7 @@ from importlib import import_module
 from typing import List, Tuple
 
 from transformers import is_tf_available
+from transformers.models.auto import get_values
 from transformers.testing_utils import (
     _tf_gpu_memory_limit,
     is_pt_tf_cross_test,
@@ -89,7 +90,7 @@ class TFModelTesterMixin:
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
         inputs_dict = copy.deepcopy(inputs_dict)
 
-        if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+        if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
             inputs_dict = {
                 k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
                 if isinstance(v, tf.Tensor) and v.ndim > 0
@@ -98,21 +99,21 @@ class TFModelTesterMixin:
             }
 
         if return_labels:
-            if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+            if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                 inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING.values():
+            elif model_class in get_values(TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING):
                 inputs_dict["start_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
                 inputs_dict["end_positions"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.values():
+            elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
                 inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
-            elif model_class in TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING.values():
+            elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
                 inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32)
             elif model_class in [
-                *TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(),
-                *TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(),
-                *TF_MODEL_FOR_MASKED_LM_MAPPING.values(),
-                *TF_MODEL_FOR_PRETRAINING_MAPPING.values(),
-                *TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(),
+                *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING),
+                *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING),
+                *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING),
+                *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING),
             ]:
                 inputs_dict["labels"] = tf.zeros(
                     (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32
@@ -580,7 +581,7 @@ class TFModelTesterMixin:
                     ),
                     "input_ids": tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32"),
                 }
-            elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
+            elif model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING):
                 input_ids = tf.keras.Input(batch_shape=(4, 2, max_input), name="input_ids", dtype="int32")
             else:
                 input_ids = tf.keras.Input(batch_shape=(2, max_input), name="input_ids", dtype="int32")
@@ -796,9 +797,9 @@ class TFModelTesterMixin:
     def test_model_common_attributes(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         list_lm_models = (
-            list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.values())
-            + list(TF_MODEL_FOR_MASKED_LM_MAPPING.values())
-            + list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values())
+            get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING)
+            + get_values(TF_MODEL_FOR_MASKED_LM_MAPPING)
+            + get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING)
         )
 
         for model_class in self.all_model_classes:
@@ -1128,7 +1129,7 @@ class TFModelTesterMixin:
                 ]
                 loss_size = tf.size(added_label)
 
-                if model.__class__ in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values():
+                if model.__class__ in get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING):
                     # if loss is causal lm loss, labels are shift, so that one label per batch
                     # is cut
                     loss_size = loss_size - self.model_tester.batch_size
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 9869133ce0..4fa45d7c66 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -19,6 +19,8 @@ import os
 import re
 from pathlib import Path
 
+from transformers.models.auto import get_values
+
 
 # All paths are set with the intent you should run this script from the root of the repo with the command
 # python utils/check_repo.py
@@ -86,7 +88,6 @@ IGNORE_NON_AUTO_CONFIGURED = [
     "DPRReader",
     "DPRSpanPredictor",
     "FlaubertForQuestionAnswering",
-    "FunnelBaseModel",
     "GPT2DoubleHeadsModel",
     "OpenAIGPTDoubleHeadsModel",
     "RagModel",
@@ -95,7 +96,6 @@ IGNORE_NON_AUTO_CONFIGURED = [
     "T5Stack",
     "TFDPRReader",
     "TFDPRSpanPredictor",
-    "TFFunnelBaseModel",
     "TFGPT2DoubleHeadsModel",
     "TFOpenAIGPTDoubleHeadsModel",
     "TFRagModel",
@@ -153,7 +153,7 @@ def get_model_modules():
 def get_models(module):
     """ Get the objects in module that are models."""
     models = []
-    model_classes = (transformers.PreTrainedModel, transformers.TFPreTrainedModel)
+    model_classes = (transformers.PreTrainedModel, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel)
     for attr_name in dir(module):
         if "Pretrained" in attr_name or "PreTrained" in attr_name:
             continue
@@ -249,10 +249,13 @@ def get_all_auto_configured_models():
     result = set()  # To avoid duplicates we concatenate all model classes in a set.
     for attr_name in dir(transformers.models.auto.modeling_auto):
         if attr_name.startswith("MODEL_") and attr_name.endswith("MAPPING"):
-            result = result | set(getattr(transformers.models.auto.modeling_auto, attr_name).values())
+            result = result | set(get_values(getattr(transformers.models.auto.modeling_auto, attr_name)))
     for attr_name in dir(transformers.models.auto.modeling_tf_auto):
         if attr_name.startswith("TF_MODEL_") and attr_name.endswith("MAPPING"):
-            result = result | set(getattr(transformers.models.auto.modeling_tf_auto, attr_name).values())
+            result = result | set(get_values(getattr(transformers.models.auto.modeling_tf_auto, attr_name)))
+    for attr_name in dir(transformers.models.auto.modeling_flax_auto):
+        if attr_name.startswith("FLAX_MODEL_") and attr_name.endswith("MAPPING"):
+            result = result | set(get_values(getattr(transformers.models.auto.modeling_flax_auto, attr_name)))
     return [cls.__name__ for cls in result]
 
 

From c2e0fd5283fa29bf0d0bed1fdbc9b4206e47c1d1 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Thu, 8 Apr 2021 15:46:54 -0700
Subject: [PATCH 8/8] [setup] make fairscale and deepspeed setup extras
 (#11151)

* make fairscale and deepspeed setup extras

* fix default

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* no reason not to ask for the good version

* update the CIs

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 .github/workflows/self-scheduled.yml          |  7 ++-----
 docs/source/main_classes/trainer.rst          | 16 ++++++++++++++++
 setup.py                                      |  4 ++++
 src/transformers/dependency_versions_check.py |  6 +++++-
 src/transformers/dependency_versions_table.py |  2 ++
 src/transformers/integrations.py              |  4 ++--
 src/transformers/trainer.py                   | 10 ++++------
 src/transformers/utils/versions.py            |  6 ++++++
 8 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index c49a967d2a..978d9e02a6 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -33,8 +33,7 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
-          pip install deepspeed
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,deepspeed]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
@@ -156,9 +155,7 @@ jobs:
         run: |
           apt -y update && apt install -y libsndfile1-dev
           pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
-          pip install fairscale
-          pip install deepspeed
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,deepspeed,fairscale]
 
       - name: Are GPUs recognized by our DL frameworks
         run: |
diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index bc9f248827..10a7a9d54a 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -274,6 +274,14 @@ Install the library via pypi:
 
     pip install fairscale
 
+or via ``transformers``' ``extras``:
+
+.. code-block:: bash
+
+    pip install transformers[fairscale]
+
+(will become available starting from ``transformers==4.6.0``)
+
 or find more details on `the FairScale's GitHub page <https://github.com/facebookresearch/fairscale/#installation>`__.
 
 If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`.
@@ -419,6 +427,14 @@ Install the library via pypi:
 
     pip install deepspeed
 
+or via ``transformers``' ``extras``:
+
+.. code-block:: bash
+
+    pip install transformers[deepspeed]
+
+(will become available starting from ``transformers==4.6.0``)
+
 or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__ and
 `advanced install <https://www.deepspeed.ai/tutorials/advanced-install/>`__.
 
diff --git a/setup.py b/setup.py
index e942e65a7c..1b2ab5bf31 100644
--- a/setup.py
+++ b/setup.py
@@ -90,7 +90,9 @@ _deps = [
     "cookiecutter==1.7.2",
     "dataclasses",
     "datasets",
+    "deepspeed>0.3.13",
     "docutils==0.16.0",
+    "fairscale>0.3",
     "faiss-cpu",
     "fastapi",
     "filelock",
@@ -233,6 +235,8 @@ extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxr
 extras["modelcreation"] = deps_list("cookiecutter")
 
 extras["sagemaker"] = deps_list("sagemaker")
+extras["deepspeed"] = deps_list("deepspeed")
+extras["fairscale"] = deps_list("fairscale")
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 extras["speech"] = deps_list("soundfile", "torchaudio")
diff --git a/src/transformers/dependency_versions_check.py b/src/transformers/dependency_versions_check.py
index 7e36aaef30..e6e676481d 100644
--- a/src/transformers/dependency_versions_check.py
+++ b/src/transformers/dependency_versions_check.py
@@ -14,7 +14,7 @@
 import sys
 
 from .dependency_versions_table import deps
-from .utils.versions import require_version_core
+from .utils.versions import require_version, require_version_core
 
 
 # define which module versions we always want to check at run time
@@ -41,3 +41,7 @@ for pkg in pkgs_to_check_at_runtime:
         require_version_core(deps[pkg])
     else:
         raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
+
+
+def dep_version_check(pkg, hint=None):
+    require_version(deps[pkg], hint)
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 43f4c028fe..bd070d7bdf 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -7,7 +7,9 @@ deps = {
     "cookiecutter": "cookiecutter==1.7.2",
     "dataclasses": "dataclasses",
     "datasets": "datasets",
+    "deepspeed": "deepspeed>0.3.13",
     "docutils": "docutils==0.16.0",
+    "fairscale": "fairscale>0.3",
     "faiss-cpu": "faiss-cpu",
     "fastapi": "fastapi",
     "filelock": "filelock",
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 65824c25ca..7e4ab0f5c7 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -24,8 +24,8 @@ import tempfile
 from copy import deepcopy
 from pathlib import Path
 
+from .dependency_versions_check import dep_version_check
 from .utils import logging
-from .utils.versions import require_version
 
 
 logger = logging.get_logger(__name__)
@@ -324,7 +324,7 @@ def deepspeed_parse_config(ds_config):
 
     If it's already a dict, return a copy of it, so that we can freely modify it.
     """
-    require_version("deepspeed>0.3.13")
+    dep_version_check("deepspeed")
 
     if isinstance(ds_config, dict):
         # Don't modify user's data should they want to reuse it (e.g. in tests), because once we
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index dc31164331..41800b7fd3 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -54,6 +54,7 @@ from torch.utils.data.distributed import DistributedSampler
 from torch.utils.data.sampler import RandomSampler, SequentialSampler
 
 from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
+from .dependency_versions_check import dep_version_check
 from .file_utils import (
     WEIGHTS_NAME,
     is_apex_available,
@@ -139,17 +140,14 @@ if is_torch_tpu_available():
     import torch_xla.distributed.parallel_loader as pl
 
 if is_fairscale_available():
+    dep_version_check("fairscale")
     import fairscale
+    from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP
     from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
+    from fairscale.nn.wrap import auto_wrap
     from fairscale.optim import OSS
     from fairscale.optim.grad_scaler import ShardedGradScaler
 
-    if version.parse(fairscale.__version__) >= version.parse("0.3"):
-        from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP
-        from fairscale.nn.wrap import auto_wrap
-    else:
-        FullyShardedDDP = None
-
 if is_sagemaker_dp_enabled():
     import smdistributed.dataparallel.torch.distributed as dist
     from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
diff --git a/src/transformers/utils/versions.py b/src/transformers/utils/versions.py
index b573a361b9..73151487bc 100644
--- a/src/transformers/utils/versions.py
+++ b/src/transformers/utils/versions.py
@@ -60,6 +60,12 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None:
     Args:
         requirement (:obj:`str`): pip style definition, e.g.,  "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
         hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met
+
+    Example::
+
+       require_version("pandas>1.1.2")
+       require_version("numpy>1.18.5", "this is important to have for whatever reason")
+
     """
 
     hint = f"\n{hint}" if hint is not None else ""