Update README.md

Update modeling_gpt_neo.py
Update README.md
2021-04-23 22:46:21 +02:00 · 2021-04-22 18:35:41 +02:00 · 2021-04-18 22:14:34 +02:00 · 2021-04-18 21:45:59 +02:00 · 2021-04-18 21:30:38 +02:00 · 2021-04-18 20:52:38 +02:00
261 changed files with 13503 additions and 3493 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -145,7 +145,7 @@ jobs:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
+            - run: python -m pytest -n 4 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -277,7 +277,7 @@ jobs:
                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[ja,testing,sentencepiece]
+            - run: pip install .[ja,testing,sentencepiece,jieba]
            - run: python -m unidic download
            - save_cache:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -348,7 +348,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install ."[all, docs]"
+            - run: pip install ."[docs]"
            - save_cache:
                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
@@ -370,7 +370,7 @@ jobs:
                  keys:
                      - v0.4-deploy_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: pip install ."[all,docs]"
+            - run: pip install ."[docs]"
            - save_cache:
                  key: v0.4-deploy_doc-{{ checksum "setup.py" }}
                  paths:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -60,4 +60,6 @@ deploy_doc "7d9a9d0" v4.2.2
 deploy_doc "bae0c79" v4.3.3
 deploy_doc "c988db5" v4.4.0
 deploy_doc "c5d6a28" v4.4.1
-deploy_doc "6bc89ed"  # v4.4.2 Latest stable release
+deploy_doc "6bc89ed" v4.4.2
+deploy_doc "4906a29" v4.5.0
+deploy_doc "4bae96e"  # v4.5.1 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -34,7 +34,7 @@ Models:
 - funnel: @sgugger
 - gpt2: @patrickvonplaten, @LysandreJik
 - rag: @patrickvonplaten, @lhoestq
- tensorflow: @LysandreJik
+- tensorflow: @Rocketknight1

 Library:

--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -30,7 +30,7 @@ Fixes # (issue)
 ## Who can review?

 Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
-members/contributors which may be interested in your PR.
+members/contributors who may be interested in your PR.

 <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @

--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - master
      - ci_*
+      - ci-*
    paths:
      - "src/**"
      - "tests/**"
@@ -186,11 +187,99 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+  run_tests_torch_cuda_extensions_gpu:  # runs tests/deepspeed and tests/extended on a single-GPU self-hosted runner
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:  # steps execute inside this image; options are extra flags handed to docker (GPU access, shared memory, host IPC, cache bind mount)
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker  # checks out the repository into the container workspace
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI  # prints driver/GPU status to the job log
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies  # installs the checked-out package with the testing and deepspeed extras
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+
+      - name: Are GPUs recognized by our DL frameworks  # logs CUDA availability, CUDA/cuDNN versions and the visible GPU count
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU  # single xdist worker; the --make-reports id is reused as the report file prefix below
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}  # still runs when an earlier step failed, so failures are always printed
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}  # upload the reports directory regardless of test outcome
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_gpu_test_reports
+          path: reports
+
+  run_tests_torch_cuda_extensions_multi_gpu:  # runs tests/deepspeed and tests/extended on a multi-GPU self-hosted runner
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:  # steps execute inside this image; all GPUs are requested explicitly because this job targets the multi-gpu runner
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker  # checks out the repository into the container workspace
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI  # prints driver/GPU status to the job log
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies  # installs the checked-out package with the testing, deepspeed and fairscale extras
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+
+      - name: Are GPUs recognized by our DL frameworks  # logs CUDA availability, CUDA/cuDNN versions and the visible GPU count
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU  # single xdist worker; the --make-reports id is reused as the report file prefix below
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}  # still runs when an earlier step failed, so failures are always printed
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}  # upload the reports directory regardless of test outcome
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+          path: reports
+
+
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
+    needs: [
+        run_tests_torch_gpu,
+        run_tests_tf_gpu,
+        run_tests_torch_multi_gpu,
+        run_tests_tf_multi_gpu,
+        run_tests_torch_cuda_extensions_gpu,
+        run_tests_torch_cuda_extensions_multi_gpu
+    ]
    steps:
      - uses: actions/checkout@v2

--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -246,11 +246,98 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+  run_all_tests_torch_cuda_extensions_gpu:  # scheduled run of tests/deepspeed and tests/extended on a single-GPU self-hosted runner
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:  # steps execute inside this image; options are extra flags handed to docker (GPU access, shared memory, host IPC, cache bind mount)
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker  # checks out the repository into the container workspace
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI  # prints driver/GPU status to the job log
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies  # installs the checked-out package with the testing and deepspeed extras
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+
+      - name: Are GPUs recognized by our DL frameworks  # logs CUDA availability, CUDA/cuDNN versions and the visible GPU count
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU  # single xdist worker; the --make-reports id is reused as the report file prefix below
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}  # still runs when an earlier step failed, so failures are always printed
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}  # upload the reports directory regardless of test outcome
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_gpu_test_reports  # NOTE(review): lacks the run_all_ prefix of this job id - confirm the results consumer expects this name
+          path: reports
+
+  run_all_tests_torch_cuda_extensions_multi_gpu:  # scheduled run of tests/deepspeed and tests/extended on a multi-GPU self-hosted runner
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:  # steps execute inside this image; all GPUs are requested explicitly because this job targets the multi-gpu runner
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker  # checks out the repository into the container workspace
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI  # prints driver/GPU status to the job log
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies  # installs the checked-out package with the testing, deepspeed and fairscale extras
+        run: |
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+
+      - name: Are GPUs recognized by our DL frameworks  # logs CUDA availability, CUDA/cuDNN versions and the visible GPU count
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU  # single xdist worker; the --make-reports id is reused as the report file prefix below
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}  # still runs when an earlier step failed, so failures are always printed
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}  # upload the reports directory regardless of test outcome
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports  # NOTE(review): lacks the run_all_ prefix of this job id - confirm the results consumer expects this name
+          path: reports
+
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    needs: [
+        run_all_tests_torch_gpu,
+        run_all_tests_tf_gpu,
+        run_all_tests_torch_multi_gpu,
+        run_all_tests_tf_multi_gpu,
+        run_all_tests_torch_cuda_extensions_gpu,
+        run_all_tests_torch_cuda_extensions_multi_gpu
+    ]
    steps:
      - uses: actions/checkout@v2

--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -2,7 +2,7 @@ name: Stale Bot

 on:
  schedule:
-    - cron: "0 0 * * *"
+    - cron: "0 15 * * *"

 jobs:
  close_stale_issues:
--- a/.gitignore
+++ b/.gitignore
@@ -9,8 +9,7 @@ __pycache__/
 *.so

 # tests and logs
-tests/fixtures/*
-!tests/fixtures/sample_text_no_unicode.txt
+tests/fixtures/cached_*_text.txt
 logs/
 lightning_logs/
 lang_code_data/
--- a/README.md
+++ b/README.md
@@ -1,3 +1,12 @@
+# Patches
+
+This branch has the following patches:
+
+* gpt-neo model is loaded directly on GPU to save system memory
+* repetition_penalty has range and slope settings, so it doesn't penalize all tokens in the context window
+* no copy of the state dict is made while loading a pretrained model
+* local self attention uses padding so it doesn't OOM on long sequences
+
 <!---
 Copyright 2020 The HuggingFace Team. All rights reserved.

@@ -200,9 +209,11 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
@@ -223,6 +234,8 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,11 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.4.2"
+const stableVersion = "v4.5.1"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.4.0/v4.4.1/v4.4.2 (stable)",
+    "": "v4.5.0/v4.5.1 (stable)",
+    "v4.4.2": "v4.4.0/v4.4.1/v4.4.2",
    "v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3",
    "v4.2.2": "v4.2.0/v4.2.1/v4.2.2",
    "v4.1.1": "v4.1.0/v4.1.1",
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.rst
@@ -388,7 +388,7 @@ Next, you can finally start adding new code to 🤗 Transformers. Go into the cl

 ::

-   cd transformers
+    cd transformers

 In the special case that you are adding a model whose architecture exactly matches the model architecture of an
 existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__.
@@ -417,27 +417,27 @@ You should do the following:

 ::

-   git checkout -b add_brand_new_bert
+    git checkout -b add_brand_new_bert

 2. Commit the automatically generated code:

 ::

-   git add .
-   git commit
+    git add .
+    git commit

 3. Fetch and rebase to current master

 ::

-   git fetch upstream
-   git rebase upstream/master
+    git fetch upstream
+    git rebase upstream/master

 4. Push the changes to your account using:

 ::

-   git push -u origin a-descriptive-name-for-my-changes
+    git push -u origin a-descriptive-name-for-my-changes

 5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
@@ -451,8 +451,8 @@ time to time by doing:

 ::

-   git fetch upstream
-   git merge upstream/master
+    git fetch upstream
+    git merge upstream/master

 In general, all questions you might have regarding the model or your implementation should be asked in your PR and
 discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -51,3 +51,4 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
 |[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
 |[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|
+| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) |
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -47,12 +47,12 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

 .. code-block:: shell

-   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-   transformers-cli convert --model_type bert \
-     --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-     --config $BERT_BASE_DIR/bert_config.json \
-     --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
+    transformers-cli convert --model_type bert \
+      --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
+      --config $BERT_BASE_DIR/bert_config.json \
+      --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here
 <https://github.com/google-research/bert#pre-trained-models>`__.
@@ -72,12 +72,12 @@ Here is an example of the conversion process for the pre-trained ``ALBERT Base``

 .. code-block:: shell

-   export ALBERT_BASE_DIR=/path/to/albert/albert_base
+    export ALBERT_BASE_DIR=/path/to/albert/albert_base

-   transformers-cli convert --model_type albert \
-     --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-     --config $ALBERT_BASE_DIR/albert_config.json \
-     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+    transformers-cli convert --model_type albert \
+      --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+      --config $ALBERT_BASE_DIR/albert_config.json \
+      --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here
 <https://github.com/google-research/albert#pre-trained-models>`__.
@@ -91,13 +91,13 @@ save as the same format than OpenAI pretrained model (see `here <https://github.

 .. code-block:: shell

-   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-   transformers-cli convert --model_type gpt \
-     --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config OPENAI_GPT_CONFIG] \
-     [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
+    transformers-cli convert --model_type gpt \
+      --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config OPENAI_GPT_CONFIG] \
+      [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \


 OpenAI GPT-2
@@ -108,13 +108,13 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode

 .. code-block:: shell

-   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

-   transformers-cli convert --model_type gpt2 \
-     --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config OPENAI_GPT2_CONFIG] \
-     [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
+    transformers-cli convert --model_type gpt2 \
+      --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config OPENAI_GPT2_CONFIG] \
+      [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]

 Transformer-XL
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -124,13 +124,13 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo

 .. code-block:: shell

-   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

-   transformers-cli convert --model_type transfo_xl \
-     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config TRANSFO_XL_CONFIG] \
-     [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
+    transformers-cli convert --model_type transfo_xl \
+      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config TRANSFO_XL_CONFIG] \
+      [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]


 XLNet
@@ -140,14 +140,14 @@ Here is an example of the conversion process for a pre-trained XLNet model:

 .. code-block:: shell

-   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
+    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-   transformers-cli convert --model_type xlnet \
-     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-     --config $TRANSFO_XL_CONFIG_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--finetuning_task_name XLNET_FINETUNED_TASK] \
+    transformers-cli convert --model_type xlnet \
+      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
+      --config $TRANSFO_XL_CONFIG_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--finetuning_task_name XLNET_FINETUNED_TASK] \


 XLM
@@ -157,13 +157,13 @@ Here is an example of the conversion process for a pre-trained XLM model:

 .. code-block:: shell

-   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-   transformers-cli convert --model_type xlm \
-     --tf_checkpoint $XLM_CHECKPOINT_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
-    [--config XML_CONFIG] \
-    [--finetuning_task_name XML_FINETUNED_TASK]
+    transformers-cli convert --model_type xlm \
+      --tf_checkpoint $XLM_CHECKPOINT_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config XLM_CONFIG] \
+      [--finetuning_task_name XLM_FINETUNED_TASK]


 T5
@@ -173,9 +173,9 @@ Here is an example of the conversion process for a pre-trained T5 model:

 .. code-block:: shell

-   export T5=/path/to/t5/uncased_L-12_H-768_A-12
+    export T5=/path/to/t5/uncased_L-12_H-768_A-12

-   transformers-cli convert --model_type t5 \
-     --tf_checkpoint $T5/t5_model.ckpt \
-     --config $T5/t5_config.json \
-     --pytorch_dump_output $T5/pytorch_model.bin
+    transformers-cli convert --model_type t5 \
+      --tf_checkpoint $T5/t5_model.ckpt \
+      --config $T5/t5_config.json \
+      --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -182,7 +182,7 @@ such:

 .. code-block::

-   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+    >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

 We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
 arguments (and not a list, like before) like this:
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -22,7 +22,7 @@ State-of-the-art NLP for everyone:
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators

-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -114,122 +114,136 @@ and conversion utilities for the following models:
 11. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-12. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+12. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
+    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
+    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
+    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
+    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+13. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-13. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+14. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-14. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+15. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-15. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+16. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
+    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
+    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+17. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-16. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+18. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-17. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+19. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-18. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+20. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-19. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+21. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-20. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+22. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-21. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+23. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-22. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+24. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-23. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+25. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-24. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+26. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-25. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+27. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-26. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+28. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-27. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+29. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-28. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+30. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-29. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+31. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-30. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+32. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-31. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+33. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-32. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+34. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-33. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+35. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
+    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+36. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
+    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+37. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-34. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+38. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-35. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+39. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-36. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+40. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-37. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+41. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-38. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+42. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-39. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+43. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-40. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+44. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-41. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+45. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-42. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+46. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-43. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+47. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-44. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+48. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-45. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+49. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-46. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+50. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-47. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+51. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-48. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+52. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-49. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+53. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-50. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+54. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -274,6 +288,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            DeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -304,6 +320,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -376,6 +394,7 @@ TensorFlow and/or Flax.

    pretrained_models
    examples
+    troubleshooting
    custom_datasets
    notebooks
    sagemaker
@@ -402,6 +421,7 @@ TensorFlow and/or Flax.

    main_classes/callback
    main_classes/configuration
+    main_classes/data_collator
    main_classes/logging
    main_classes/model
    main_classes/optimizer_schedules
@@ -423,15 +443,18 @@ TensorFlow and/or Flax.
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
+    model_doc/bert_japanese
    model_doc/bigbird
    model_doc/blenderbot
    model_doc/blenderbot_small
    model_doc/bort
    model_doc/camembert
    model_doc/convbert
+    model_doc/cpm
    model_doc/ctrl
    model_doc/deberta
    model_doc/deberta_v2
+    model_doc/deit
    model_doc/dialogpt
    model_doc/distilbert
    model_doc/dpr
@@ -449,6 +472,8 @@ TensorFlow and/or Flax.
    model_doc/marian
    model_doc/m2m_100
    model_doc/mbart
+    model_doc/megatron_bert
+    model_doc/megatron_gpt2
    model_doc/mobilebert
    model_doc/mpnet
    model_doc/mt5
--- a/docs/source/main_classes/data_collator.rst
+++ b/docs/source/main_classes/data_collator.rst
@@ -0,0 +1,71 @@
+..
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Data Collator
+-----------------------------------------------------------------------------------------------------------------------
+
+Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
+the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.
+
+To be able to build batches, data collators may apply some processing (like padding). Some of them (like
+:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
+on the formed batch.
+
+Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.
+
+
+Default data collator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.data.data_collator.default_data_collator
+
+
+DataCollatorWithPadding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorWithPadding
+    :members:
+
+
+DataCollatorForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification
+    :members:
+
+
+DataCollatorForSeq2Seq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq
+    :members:
+
+
+DataCollatorForLanguageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
+    :members: mask_tokens
+
+
+DataCollatorForWholeWordMask
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
+    :members: mask_tokens
+
+
+DataCollatorForPermutationLanguageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
+    :members: mask_tokens
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -134,6 +134,8 @@ Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, O

 This provided support is new and experimental as of this writing.

+.. _zero-install-notes:
+
 Installation Notes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -156,7 +158,8 @@ please, read the following notes first.
 In these notes we give examples for what to do when ``pytorch`` has been built with CUDA ``10.2``. If your situation is
 different remember to adjust the version number to the one you are after.

-**Possible problem #1:**
+Possible problem #1
+=======================================================================================================================

 While PyTorch comes with its own CUDA toolkit, to build these two projects you must have an identical version of CUDA
 installed system-wide.
@@ -176,7 +179,8 @@ If you don't have CUDA installed system-wide, install it first. You will find th
 search engine. For example, if you're on Ubuntu you may want to search for: `ubuntu cuda 10.2 install
 <https://www.google.com/search?q=ubuntu+cuda+10.2+install>`__.

-**Possible problem #2:**
+Possible problem #2
+=======================================================================================================================

 Another possible common problem is that you may have more than one CUDA toolkit installed system-wide. For example you
 may have:
@@ -222,7 +226,8 @@ exist. ``lib64`` sub-directory is where the various CUDA ``.so`` objects, like `
 that your system will have it named differently, but if it is adjust it to reflect your reality.


-**Possible problem #3:**
+Possible problem #3
+=======================================================================================================================

 Some older CUDA versions may refuse to build with newer compilers. For example, you may have ``gcc-9`` but it wants
 ``gcc-7``.
@@ -247,13 +252,6 @@ should find ``gcc-7`` (and ``g++7``) and then the build will succeed.

 As always make sure to edit the paths in the example to match your situation.

-**If still unsuccessful:**
-
-If after addressing these you still encounter build issues, please, proceed with the GitHub Issue of `FairScale
-<https://github.com/facebookresearch/fairscale/issues>`__ and `Deepspeed
-<https://github.com/microsoft/DeepSpeed/issues>`__, depending on the project you have the problem with.
-
-
 FairScale
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -267,20 +265,74 @@ provides support for the following features from `the ZeRO paper <https://arxiv.

 You will need at least two GPUs to use this feature.

-To deploy this feature:

-1. Install the library via pypi:
+**Installation**:

-   .. code-block:: bash
+Install the library via pypi:

-       pip install fairscale
+.. code-block:: bash

-   or find more details on `the FairScale's GitHub page
-   <https://github.com/facebookresearch/fairscale/#installation>`__.
+    pip install fairscale

-2. To use the first version of Sharded data-parallelism, add ``--sharded_ddp simple`` to the command line arguments,
-   and make sure you have added the distributed launcher ``-m torch.distributed.launch
-   --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
+or via ``transformers``' ``extras``:
+
+.. code-block:: bash
+
+    pip install transformers[fairscale]
+
+(will become available starting from ``transformers==4.6.0``)
+
+or find more details on `the FairScale's GitHub page <https://github.com/facebookresearch/fairscale/#installation>`__.
+
+If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`.
+
+If that still hasn't resolved the build issue, here are a few more ideas.
+
+``fairscale`` seems to have an issue with the build isolation feature recently introduced by pip. If you have a problem
+with it, you may want to try one of:
+
+.. code-block:: bash
+
+    pip install fairscale --no-build-isolation .
+
+or:
+
+.. code-block:: bash
+
+    git clone https://github.com/facebookresearch/fairscale/
+    cd fairscale
+    rm -r dist build
+    python setup.py bdist_wheel
+    pip uninstall -y fairscale
+    pip install dist/fairscale-*.whl
+
+``fairscale`` also has issues with building against pytorch-nightly, so if you use it you may have to try one of:
+
+.. code-block:: bash
+
+    pip uninstall -y fairscale; pip install fairscale --pre \
+    -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html \
+    --no-cache --no-build-isolation
+
+or:
+
+.. code-block:: bash
+
+    pip install -v --disable-pip-version-check . \
+    -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html --pre
+
+Of course, adjust the URLs to match the CUDA version you use.
+
+If after trying everything suggested you still encounter build issues, please open a GitHub Issue with
+`FairScale <https://github.com/facebookresearch/fairscale/issues>`__.
+
+
+
+**Usage**:
+
+To use the first version of Sharded data-parallelism, add ``--sharded_ddp simple`` to the command line arguments, and
+make sure you have added the distributed launcher ``-m torch.distributed.launch
+--nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.

 For example here is how you could use it for ``run_translation.py`` with 2 GPUs:

@@ -303,9 +355,9 @@ Notes:
  able to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
  significantly shorter training time.

-3. To use the second version of Sharded data-parallelism, add ``--sharded_ddp zero_dp_2`` or ``--sharded_ddp zero_dp_3`
-   to the command line arguments, and make sure you have added the distributed launcher ``-m torch.distributed.launch
-   --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
+3. To use the second version of Sharded data-parallelism, add ``--sharded_ddp zero_dp_2`` or ``--sharded_ddp
+   zero_dp_3`` to the command line arguments, and make sure you have added the distributed launcher ``-m
+   torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.

 For example here is how you could use it for ``run_translation.py`` with 2 GPUs:

@@ -346,19 +398,23 @@ DeepSpeed
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 `DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ implements everything described in the `ZeRO paper
-<https://arxiv.org/abs/1910.02054>`__, except ZeRO's stage 3. "Parameter Partitioning (Pos+g+p)". Currently it provides
-full support for:
+<https://arxiv.org/abs/1910.02054>`__. Currently it provides full support for:

 1. Optimizer State Partitioning (ZeRO stage 1)
-2. Add Gradient Partitioning (ZeRO stage 2)
-3. Custom fp16 handling
-4. A range of fast Cuda-extension-based Optimizers
-5. ZeRO-Offload
+2. Gradient Partitioning (ZeRO stage 2)
+3. Param Partitioning (ZeRO stage 3)
+4. Custom mixed precision training handling
+5. A range of fast CUDA-extension-based Optimizers
+6. ZeRO-Offload

 ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training
 <https://arxiv.org/abs/2101.06840>`__.

-DeepSpeed is currently used only for training, as all the currently available features are of no use to inference.
+DeepSpeed ZeRO-2 is currently used only for training, as all the currently available features are of no use to
+inference.
+
+DeepSpeed ZeRO-3 can be used for inference as well, since it allows huge models to be loaded on multiple GPUs, which
+wouldn't be possible on a single GPU.



@@ -371,7 +427,82 @@ Install the library via pypi:

    pip install deepspeed

-or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__.
+or via ``transformers``' ``extras``:
+
+.. code-block:: bash
+
+    pip install transformers[deepspeed]
+
+(will become available starting from ``transformers==4.6.0``)
+
+or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__ and
+`advanced install <https://www.deepspeed.ai/tutorials/advanced-install/>`__.
+
+If you're still struggling with the build, first make sure to read :ref:`zero-install-notes`.
+
+If you don't prebuild the extensions and rely on them to be built at run time and you tried all of the above solutions
+to no avail, the next thing to try is to pre-build the modules before installing them.
+
+To make a local build for DeepSpeed:
+
+.. code-block:: bash
+
+    git clone https://github.com/microsoft/DeepSpeed/
+    cd DeepSpeed
+    rm -rf build
+    TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \
+    --global-option="build_ext" --global-option="-j8" --no-cache -v \
+    --disable-pip-version-check 2>&1 | tee build.log
+
+Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use.
+
+Or if you need to use the same setup on multiple machines, make a binary wheel:
+
+.. code-block:: bash
+
+    git clone https://github.com/microsoft/DeepSpeed/
+    cd DeepSpeed
+    rm -rf build
+    TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \
+    python setup.py build_ext -j8 bdist_wheel
+
+it will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install
+as ``pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` locally or on any other machine.
+
+Again, remember to adjust ``TORCH_CUDA_ARCH_LIST`` to the target architectures.
+
+You can find the complete list of NVIDIA GPUs and their corresponding **Compute Capabilities** (same as arch in this
+context) `here <https://developer.nvidia.com/cuda-gpus>`__.
+
+You can check the archs pytorch was built with using:
+
+.. code-block:: bash
+
+    python -c "import torch; print(torch.cuda.get_arch_list())"
+
+Here is how to find out the arch for one of the installed GPUs. For example, for GPU 0:
+
+.. code-block:: bash
+
+    CUDA_VISIBLE_DEVICES=0 python -c "import torch; \
+    print(torch.cuda.get_device_properties(torch.device('cuda')))"
+
+If the output is:
+
+.. code-block:: bash
+
+    _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82)
+
+then you know that this card's arch is ``8.6``.
+
+You can also leave ``TORCH_CUDA_ARCH_LIST`` out completely and then the build program will automatically query the
+architecture of the GPUs the build is made on. This may or may not match the GPUs on the target machines, that's why
+it's best to specify the desired archs explicitly.
+
+If after trying everything suggested you still encounter build issues, please proceed to the GitHub Issues of
+`DeepSpeed <https://github.com/microsoft/DeepSpeed/issues>`__.
+
+

 Deployment with multiple GPUs
 =======================================================================================================================
@@ -410,7 +541,7 @@ Here is an example of running ``run_translation.py`` under DeepSpeed deploying a
 .. code-block:: bash

    deepspeed examples/seq2seq/run_translation.py \
-    --deepspeed examples/tests/deepspeed/ds_config.json \
+    --deepspeed tests/deepspeed/ds_config.json \
    --model_name_or_path t5-small --per_device_train_batch_size 1   \
    --output_dir output_dir --overwrite_output_dir --fp16 \
    --do_train --max_train_samples 500 --num_train_epochs 1 \
@@ -435,7 +566,7 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma
 .. code-block:: bash

    deepspeed --num_gpus=1 examples/seq2seq/run_translation.py \
-    --deepspeed examples/tests/deepspeed/ds_config.json \
+    --deepspeed tests/deepspeed/ds_config.json \
    --model_name_or_path t5-small --per_device_train_batch_size 1   \
    --output_dir output_dir --overwrite_output_dir --fp16 \
    --do_train --max_train_samples 500 --num_train_epochs 1 \
@@ -460,18 +591,18 @@ with DeepSpeed is to have at least the following configuration in the configurat

 .. code-block:: json

-  {
-    "zero_optimization": {
-       "stage": 2,
-       "allgather_partitions": true,
-       "allgather_bucket_size": 2e8,
-       "reduce_scatter": true,
-       "reduce_bucket_size": 2e8,
-       "overlap_comm": true,
-       "contiguous_gradients": true,
-       "cpu_offload": true
-    },
-  }
+    {
+      "zero_optimization": {
+         "stage": 2,
+         "allgather_partitions": true,
+         "allgather_bucket_size": 2e8,
+         "reduce_scatter": true,
+         "reduce_bucket_size": 2e8,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "cpu_offload": true
+      },
+    }

 which enables ``cpu_offload`` and some other important features. You may experiment with the buffer sizes, you will
 find more details in the discussion below.
@@ -498,7 +629,7 @@ Deployment in Notebooks
 The problem with running notebook cells as a script is that there is no normal ``deepspeed`` launcher to rely on, so
 under certain setups we have to emulate it.

-Here is how you'd have to adjust your training code in the notebook to use DeepSpeed.
+If you're using only 1 GPU, here is how you'd have to adjust your training code in the notebook to use DeepSpeed.

 .. code-block:: python

@@ -516,7 +647,11 @@ Here is how you'd have to adjust your training code in the notebook to use DeepS
    trainer = Trainer(...)
    trainer.train()

-Note: `...` stands for the normal arguments that you'd pass to the functions.
+Note: ``...`` stands for the normal arguments that you'd pass to the functions.
+
+If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. That is, you have
+to use the launcher for that purpose and this cannot be accomplished by emulating the distributed environment presented
+at the beginning of this section.

 If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated
 cell with:
@@ -570,21 +705,29 @@ cell with:
    EOT


-That's said if the script is not in the notebook cells, you can launch ``deepspeed`` normally via shell from a cell
-with:
+If the training script is in a normal file and not in the notebook cells, you can launch ``deepspeed`` normally via
+shell from a cell. For example, to use ``run_translation.py`` you would launch it with:

 .. code-block::

-   !deepspeed examples/seq2seq/run_translation.py ...
+    !git clone https://github.com/huggingface/transformers
+    !cd transformers; deepspeed examples/seq2seq/run_translation.py ...

-or with bash magic, where you can write a multi-line code for the shell to run:
+or with ``%%bash`` magic, where you can write multi-line code for the shell program to run:

 .. code-block::

-   %%bash
+    %%bash
+
+    git clone https://github.com/huggingface/transformers
+    cd transformers
+    deepspeed examples/seq2seq/run_translation.py ...
+
+In such a case you don't need any of the code presented at the beginning of this section.
+
+Note: ``%%bash`` magic is neat, but currently it buffers the output so you won't see the logs until the process
+completes.

-   cd /somewhere
-   deepspeed examples/seq2seq/run_translation.py ...



@@ -600,16 +743,16 @@ repo <https://github.com/microsoft/DeepSpeedExamples>`__:

 .. code-block:: bash

-  git clone https://github.com/microsoft/DeepSpeedExamples
-  cd DeepSpeedExamples
-  find . -name '*json'
+    git clone https://github.com/microsoft/DeepSpeedExamples
+    cd DeepSpeedExamples
+    find . -name '*json'

 Continuing the code from above, let's say you're looking to configure the Lamb optimizer. So you can search through the
 example ``.json`` files with:

 .. code-block:: bash

-  grep -i Lamb $(find . -name '*json')
+    grep -i Lamb $(find . -name '*json')

 Some more examples are to be found in the `main repo <https://github.com/microsoft/DeepSpeed>`__ as well.

@@ -717,26 +860,45 @@ Of course, you will need to adjust the values in this example to your situation.
 ZeRO
 =======================================================================================================================

+`Zero Redundancy Optimizer (ZeRO) <https://www.deepspeed.ai/tutorials/zero/>`__ is the workhorse of DeepSpeed. It
+supports 3 different levels (stages) of optimization. The first one is not quite interesting for scalability purposes,
+therefore this document focuses on stages 2 and 3. You will find more in-depth information in the DeepSpeed
+documentation.
+
 The ``zero_optimization`` section of the configuration file is the most important part (`docs
 <https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training>`__), since that is where you define
-which ZeRO stages you want to enable and how to configure them.
+which ZeRO stages you want to enable and how to configure them. You will find the explanation for each parameter in the
+DeepSpeed docs.
+
+This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
+no equivalent command line arguments.
+
+Note: currently DeepSpeed doesn't validate parameter names, so if you misspell any, it'll use the default setting for
+the parameter that got misspelled. You can watch the DeepSpeed engine start up log messages to see what values it is
+going to use.
+
+
+ZeRO-2 Config
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+The following is an example configuration for ZeRO stage 2:

 .. code-block:: json

    {
-       "zero_optimization": {
-           "stage": 2,
-           "allgather_partitions": true,
-           "allgather_bucket_size": 5e8,
-           "overlap_comm": true,
-           "reduce_scatter": true,
-           "reduce_bucket_size": 5e8,
-           "contiguous_gradients": true,
-           "cpu_offload": true
-       }
+        "zero_optimization": {
+            "stage": 2,
+            "allgather_partitions": true,
+            "allgather_bucket_size": 5e8,
+            "overlap_comm": true,
+            "reduce_scatter": true,
+            "reduce_bucket_size": 5e8,
+            "contiguous_gradients": true,
+            "cpu_offload": true
+        }
    }

-Notes:
+**Performance tuning:**

 - enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``)
 - ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x
@@ -748,9 +910,217 @@ Notes:
  the slower the communication, and the more GPU RAM will be available to other tasks. So if a bigger batch size is
  important, getting a slightly slower training time could be a good trade.

-This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
-no equivalent command line arguments.

+ZeRO-3 Config
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+The following is an example configuration for ZeRO stage 3:
+
+
+.. code-block:: json
+
+    {
+        "zero_optimization": {
+            "stage": 3,
+            "cpu_offload": true,
+            "cpu_offload_params": true,
+            "cpu_offload_use_pin_memory" : true,
+            "overlap_comm": true,
+            "contiguous_gradients": true,
+            "sub_group_size": 1e14,
+            "reduce_bucket_size": 1e6,
+            "stage3_prefetch_bucket_size": 0.94e6,
+            "stage3_param_persistence_threshold": 1e4,
+            "stage3_max_live_parameters": 1e9,
+            "stage3_max_reuse_distance": 1e9,
+            "stage3_gather_fp16_weights_on_model_save": true
+        }
+    }
+
+Note: if you're migrating from a ZeRO-2 configuration, note that the ``allgather_partitions``,
+``allgather_bucket_size`` and ``reduce_scatter`` configuration parameters are not used in ZeRO-3. If you keep these
+they will just be ignored.
+
+**Performance tuning:**
+
+- ``sub_group_size``: ``1e14``
+- ``reduce_bucket_size``: ``hidden_size*hidden_size``
+- ``stage3_prefetch_bucket_size``: ``0.9 * hidden_size * hidden_size``
+- ``stage3_param_persistence_threshold``: ``10 * hidden_size``
+- ``stage3_max_live_parameters``: ``1e9``
+- ``stage3_max_reuse_distance``: ``1e9``
+
+If hitting OOM reduce ``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``. They should have minimal impact
+on performance unless you are doing activation checkpointing. ``1e9`` would consume ~2GB. The memory is shared by
+``stage3_max_live_parameters`` and ``stage3_max_reuse_distance``, so it's not additive, it's just 2GB total.
+
+``stage3_max_live_parameters`` is the upper limit on how many full parameters you want to keep on the GPU at any given
+time. "reuse distance" is a metric we are using to figure out when will a parameter be used again in the future, and we
+use the ``stage3_max_reuse_distance`` to decide whether to throw away the parameter or to keep it. If a parameter is
+going to be used again in near future (less than ``stage3_max_reuse_distance``) then we keep it to reduce communication
+overhead. This is super helpful when you have activation checkpointing enabled, where we do a forward recompute and
+backward passes at a single layer granularity and want to keep the parameter in the forward recompute till the
+backward.
+
+If you set ``reduce_bucket_size``, ``stage3_prefetch_bucket_size`` and ``stage3_param_persistence_threshold`` as
+recommended above, they will already be fairly small so you won't have to tune those much.
+
+Since ``hidden_size`` varies from model to model, the ``Trainer`` will automatically set the needed value for the 3
+config parameters that contain that variable (using ``model.config.hidden_size``). Just set these values to ``0`` as
+shown below and the right configuration will be passed to DeepSpeed:
+
+.. code-block:: json
+
+    {
+        "zero_optimization": {
+            "stage": 3,
+            "cpu_offload": true,
+            "cpu_offload_params": true,
+            "cpu_offload_use_pin_memory" : true,
+            "overlap_comm": true,
+            "contiguous_gradients": true,
+            "sub_group_size": 1e14,
+            "reduce_bucket_size": 0,
+            "stage3_prefetch_bucket_size": 0,
+            "stage3_param_persistence_threshold": 0,
+            "stage3_max_live_parameters": 1e9,
+            "stage3_max_reuse_distance": 1e9,
+            "stage3_gather_fp16_weights_on_model_save": true
+        }
+    }
+
+``stage3_gather_fp16_weights_on_model_save`` enables model fp16 weights consolidation when model gets saved. With large
+models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if
+you plan to resume the training. Watch out for future updates that will remove this limitation and make things more
+flexible.
+
+
+ZeRO-2 vs ZeRO-3 Performance
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+ZeRO-3 is likely to be slower than ZeRO-2 if everything else is configured the same because the former has to gather
+model weights in addition to what ZeRO-2 does. If ZeRO-2 meets your needs and you don't need to scale beyond a few GPUs
+then you may choose to stick to it. It's important to understand that ZeRO-3 enables a much higher scalability capacity
+at a cost of speed.
+
+It's possible to adjust ZeRO-3 configuration to make it perform closer to ZeRO-2:
+
+- set ``stage3_param_persistence_threshold`` to a very large number - larger than the largest parameter, e.g., ``6 *
+  hidden_size * hidden_size``. This will keep the parameters on the GPUs.
+- turn off ``cpu_offload_params`` since ZeRO-2 doesn't have that option.
+
+The performance will likely improve significantly with just ``cpu_offload_params`` turned off, even if you don't change
+``stage3_param_persistence_threshold``. Of course, these changes will impact the size of the model you can train. So
+these help you to trade scalability for speed depending on your needs.
+
+
+
+ZeRO-2 Example
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+Here is a full ZeRO-2 all-enabled configuration file ``ds_config_zero2.json``:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "initial_scale_power": 16,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+
+        "zero_optimization": {
+            "stage": 2,
+            "allgather_partitions": true,
+            "allgather_bucket_size": 2e8,
+            "overlap_comm": true,
+            "reduce_scatter": true,
+            "reduce_bucket_size": 2e8,
+            "contiguous_gradients": true,
+            "cpu_offload": true
+        },
+
+        "optimizer": {
+            "type": "AdamW",
+            "params": {
+                "lr": 3e-5,
+                "betas": [0.8, 0.999],
+                "eps": 1e-8,
+                "weight_decay": 3e-7
+            }
+        },
+
+        "scheduler": {
+            "type": "WarmupLR",
+            "params": {
+                "warmup_min_lr": 0,
+                "warmup_max_lr": 3e-5,
+                "warmup_num_steps": 500
+            }
+        },
+
+        "steps_per_print": 2000,
+        "wall_clock_breakdown": false
+    }
+
+
+
+ZeRO-3 Example
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+Here is a full ZeRO-3 all-enabled configuration file ``ds_config_zero3.json``:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "initial_scale_power": 16,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+
+        "zero_optimization": {
+            "stage": 3,
+            "cpu_offload": true,
+            "cpu_offload_params": true,
+            "cpu_offload_use_pin_memory" : true,
+            "overlap_comm": true,
+            "contiguous_gradients": true,
+            "sub_group_size": 1e14,
+            "reduce_bucket_size": 1e6,
+            "stage3_prefetch_bucket_size": 0.94e6,
+            "stage3_param_persistence_threshold": 1e4,
+            "stage3_max_live_parameters": 1e9,
+            "stage3_max_reuse_distance": 1e9,
+            "stage3_gather_fp16_weights_on_model_save": true
+        },
+
+        "optimizer": {
+            "type": "AdamW",
+            "params": {
+                "lr": 3e-5,
+                "betas": [0.8, 0.999],
+                "eps": 1e-8,
+                "weight_decay": 3e-7
+            }
+        },
+
+        "scheduler": {
+            "type": "WarmupLR",
+            "params": {
+                "warmup_min_lr": 0,
+                "warmup_max_lr": 3e-5,
+                "warmup_num_steps": 500
+            }
+        },
+
+        "steps_per_print": 2000,
+        "wall_clock_breakdown": false
+    }


 Optimizer and Scheduler
@@ -772,7 +1142,7 @@ If ``cpu_offload`` is enabled you must use both DeepSpeed scheduler and DeepSpee


 Optimizer
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


 DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
@@ -818,7 +1188,7 @@ make sure to adjust the values. e.g. if use Adam you will want ``weight_decay``


 Scheduler
-"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

 DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here
 <https://www.deepspeed.ai/docs/config-json/#scheduler-parameters>`__.
@@ -886,11 +1256,7 @@ and ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be corr
 Automatic Mixed Precision
 =======================================================================================================================

-You can work with FP16 in one of the following ways:
-
-1. Pytorch native amp, as documented `here <https://www.deepspeed.ai/docs/config-json/#fp16-training-options>`__.
-2. NVIDIA's apex, as documented `here
-   <https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options>`__.
+You can use automatic mixed precision in either a pytorch-like AMP way or an apex-like way:

 If you want to use an equivalent of the Pytorch native amp, you can either configure the ``fp16`` entry in the
 configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``.
@@ -909,6 +1275,8 @@ Here is an example of the ``fp16`` configuration:
        },
    }

+Here is the `documentation <https://www.deepspeed.ai/docs/config-json/#fp16-training-options>`__.
+
 If you want to use NVIDIA's apex instead, you can either configure the ``amp`` entry in the configuration file, or
 use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level O1``.

@@ -923,6 +1291,9 @@ Here is an example of the ``amp`` configuration:
        }
    }

+Here is the `documentation
+<https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options>`__.
+

 Gradient Accumulation
 =======================================================================================================================
@@ -935,12 +1306,12 @@ While normally DeepSpeed gets gradient accumulation configured with:
        "gradient_accumulation_steps": 3,
    }

-in this case, to enable gradient accumulation, pass the command line `--gradient_accumulation_steps` argument as normal
-and it will get injected into the DeepSpeed configuration.
+in this case, to enable gradient accumulation, pass the command line ``--gradient_accumulation_steps 3`` argument as
+normal and it will get injected into the DeepSpeed configuration.

-If you try to add it directly to the configuration file, you will receive an error from the Trainer - this is because
-this setting is needed by the Trainer too, and so this approach ensures that there is a single way of setting this
-value and thus avoid potential subtle errors.
+If you try to add it directly to the configuration file, you will receive an error from the ``Trainer`` - this is
+because this setting is needed by the ``Trainer`` too, and so this approach ensures that there is a single way of
+setting this value and thus avoid potential subtle errors.



@@ -963,6 +1334,175 @@ Here is an example of the ``gradient_clipping`` configuration:



+Getting the model weights out
+=======================================================================================================================
+
+As long as you continue training and resuming using DeepSpeed you don't need to worry about anything. DeepSpeed stores
+fp32 master weights in its custom checkpoint optimizer files, which are ``global_step*/*optim_states.pt`` (this is a
+glob pattern), and are saved under the normal checkpoint.
+
+**FP16 Weights:**
+
+When a model is saved under ZeRO-2, you end up having the normal ``pytorch_model.bin`` file with the model weights, but
+they are only the fp16 version of the weights.
+
+Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs,
+therefore ``"stage3_gather_fp16_weights_on_model_save": true`` is required to get the ``Trainer`` to save the fp16
+version of the weights. If this setting is ``False`` ``pytorch_model.bin`` won't be created. This is because by default
+DeepSpeed's ``state_dict`` contains a placeholder and not the real weights. If we were to save this ``state_dict`` it
+won't be possible to load it back.
+
+**FP32 Weights:**
+
+While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to
+the `models hub <https://huggingface.co/models>`__ or pass it to someone else you most likely will want to get the fp32
+weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this
+is performed offline.
+
+DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint
+folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to
+have the configuration file or a ``Trainer`` to do the extraction.
+
+Let's say your checkpoint folder looks like this:
+
+.. code-block:: bash
+
+    $ ls -l output_dir/checkpoint-1/
+    -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json
+    drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/
+    -rw-rw-r-- 1 stas stas   12 Mar 27 13:16 latest
+    -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt
+    -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin
+    -rw-rw-r-- 1 stas stas  623 Mar 27 20:42 scheduler.pt
+    -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json
+    -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model
+    -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json
+    -rw-rw-r-- 1 stas stas  339 Mar 27 20:42 trainer_state.json
+    -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin
+    -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py*
+
+In this example there is just one DeepSpeed checkpoint sub-folder ``global_step1``. Therefore to reconstruct the fp32
+weights just run:
+
+.. code-block:: bash
+
+    python zero_to_fp32.py global_step1 pytorch_model.bin
+
+The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint.
+
+``python zero_to_fp32.py -h`` will give you usage details.
+
+If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights.
+
+This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+
+Note: currently the script requires 2x general RAM of the final fp32 model weights.
+
+ZeRO 3 Nuances
+=======================================================================================================================
+
+ZeRO 3 is quite different from ZeRO 2 because of its param sharding feature.
+
+While all the efforts were made for things to just work without needing any special changes to your models, in certain
+circumstances you may find the following information to be needed.
+
+
+Registering External Parameters
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+If layer A needs to access weights belonging to layer B, currently layer A needs to tell DeepSpeed about it. This is
+done with the help of ``deepspeed.zero.register_external_parameter`` that needs to be called in ``A.__init__`` and can
+be seen in the following example:
+
+.. code-block:: python
+
+    class ModuleZ3(torch.nn.Module):
+        def __init__(self, *args):
+            super().__init__(*args)
+            self.layer1 = SomeLayer()
+            self.layer2 = OtherLayer()
+            deepspeed.zero.register_external_parameter(self, self.layer1.weight)
+
+        def forward(self, input):
+            x = self.layer1(input)
+            # self.layer1.weight is needed in ModuleZ3.forward
+            y = self.layer2(x, self.layer1.weight)
+            return y
+
+In general ``transformers`` models don't use this style of referring to other layer's weights so most likely you won't
+need to use it.
+
+For full details on this method please refer to `Registering External Parameters
+<https://deepspeed.readthedocs.io/en/latest/zero3.html#registering-external-parameters>`__.
+
+
+
+Constructing Massive Models
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+DeepSpeed/ZeRO-3 can handle models with Trillions of parameters which may not fit onto the existing RAM. In such cases,
+but also if you want the initialization to happen much faster, initialize the model using ``deepspeed.zero.Init()``
+context manager (which is also a function decorator), like so:
+
+.. code-block:: python
+
+    from transformers import T5ForConditionalGeneration, T5Config
+    import deepspeed
+    with deepspeed.zero.Init():
+       config = T5Config.from_pretrained("t5-small")
+       model = T5ForConditionalGeneration(config)
+
+As you can see this gives you a randomly initialized model.
+
+If you want to use a pretrained model, ``model_class.from_pretrained`` will activate this feature as long as
+``is_deepspeed_zero3_enabled()`` returns ``True``, which can be set manually via ``deepspeed_zero3_enable(True)``.
+Therefore to enable this feature here is the required sequence:
+
+.. code-block:: python
+
+    from transformers.integrations import deepspeed_zero3_enable
+    deepspeed_zero3_enable(True)
+    model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+If you're using ``Trainer`` command line arguments which include ``--deepspeed ds_config.json`` with ZeRO-3 config
+enabled, then you can skip ``deepspeed_zero3_enable(True)`` as it will try to discover whether it'll be run under
+ZeRO-3 and ``from_pretrained`` will automatically activate this feature.
+
+Note: If the fp16 weights of the model can't fit onto the memory of a single GPU this feature must be used.
+
+For full details on this method and other related features please refer to `Constructing Massive Models
+<https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models>`__.
+
+
+
+
+
+Gathering Parameters
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+Under ZeRO-3 on multiple GPUs no single GPU has all the parameters unless it's the parameters for the currently
+executing layer. So if you need to access all parameters from all layers at once there is a specific method to do it.
+Most likely you won't need it, but if you do please refer to `Gathering Parameters
+<https://deepspeed.readthedocs.io/en/latest/zero3.html#manual-parameter-coordination>`__.
+
+We do however use it internally in several places, one such example is when loading pretrained model weights in
+``from_pretrained``. We load one layer at a time and immediately partition it to all participating GPUs, as for very
+large models it won't be possible to load it on one GPU and then spread it out to multiple GPUs, due to memory
+limitations.
+
+Also under ZeRO-3, if you write your own code and run into a model parameter weight that looks like:
+
+.. code-block:: python
+
+    tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True)
+
+stress on ``tensor([1.])``, or if you get an error where it says the parameter is of size ``1``, instead of some much
+larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.
+
+
+
+
+
 Notes
 =======================================================================================================================

--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -44,6 +44,13 @@ AutoTokenizer
    :members:


+AutoFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoFeatureExtractor
+    :members:
+
+
 AutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -90,7 +90,7 @@ BertForPreTraining
    :members: forward


-BertModelLMHeadModel
+BertLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertLMHeadModel
--- a/docs/source/model_doc/bert_japanese.rst
+++ b/docs/source/model_doc/bert_japanese.rst
@@ -0,0 +1,78 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BertJapanese
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BERT models trained on Japanese text.
+
+There are models with two different tokenization methods:
+
+- Tokenize with MeCab and WordPiece. This requires some extra dependencies, `fugashi
+  <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.
+- Tokenize into characters.
+
+To use `MecabTokenizer`, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install
+from source) to install dependencies.
+
+See `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__.
+
+Example of using a model with MeCab and WordPiece tokenization:
+
+.. code-block::
+
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer 
+
+    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
+    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
+
+    >>> ## Input Japanese Text
+    >>> line = "吾輩は猫である。"
+
+    >>> inputs = tokenizer(line, return_tensors="pt")
+
+    >>> print(tokenizer.decode(inputs['input_ids'][0]))
+    [CLS] 吾輩 は 猫 で ある 。 [SEP]
+
+    >>> outputs = bertjapanese(**inputs)
+
+Example of using a model with Character tokenization:
+
+.. code-block::
+
+    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
+    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
+
+    >>> ## Input Japanese Text
+    >>> line = "吾輩は猫である。"
+
+    >>> inputs = tokenizer(line, return_tensors="pt")
+
+    >>> print(tokenizer.decode(inputs['input_ids'][0]))
+    [CLS] 吾 輩 は 猫 で あ る 。 [SEP]
+
+    >>> outputs = bertjapanese(**inputs)
+
+Tips:
+
+- This implementation is the same as BERT, except for tokenization method. Refer to the :doc:`documentation of BERT
+  <bert>` for more usage examples.
+
+BertJapaneseTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertJapaneseTokenizer
+    :members: 
--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -38,22 +38,22 @@ Usage:

 .. code-block::

-  # leverage checkpoints for Bert2Bert model...
-  # use BERT's cls token as BOS token and sep token as EOS token
-  encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
-  # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
-  decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
-  bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+    >>> # leverage checkpoints for Bert2Bert model...
+    >>> # use BERT's cls token as BOS token and sep token as EOS token
+    >>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
+    >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
+    >>> decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
+    >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

-  # create tokenizer...
-  tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+    >>> # create tokenizer...
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

-  input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
-  labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+    >>> input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
+    >>> labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

-  # train...
-  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
-  loss.backward()
+    >>> # train...
+    >>> loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
+    >>> loss.backward()


 - Pretrained :class:`~transformers.EncoderDecoderModel` are also directly available in the model hub, e.g.,
@@ -61,15 +61,15 @@ Usage:

 .. code-block::

-  # instantiate sentence fusion model
-  sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
-  tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
+    >>> # instantiate sentence fusion model
+    >>> sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")

-  input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+    >>> input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids

-  outputs = sentence_fuser.generate(input_ids)
+    >>> outputs = sentence_fuser.generate(input_ids)

-  print(tokenizer.decode(outputs[0]))
+    >>> print(tokenizer.decode(outputs[0]))


 Tips:
--- a/docs/source/model_doc/bertweet.rst
+++ b/docs/source/model_doc/bertweet.rst
@@ -31,28 +31,28 @@ Example of use:

 .. code-block::

-  import torch
-  from transformers import AutoModel, AutoTokenizer 
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer 

-  bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+    >>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

-  # For transformers v4.x+: 
-  tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+    >>> # For transformers v4.x+: 
+    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

-  # For transformers v3.x: 
-  # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+    >>> # For transformers v3.x: 
+    >>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

-  # INPUT TWEET IS ALREADY NORMALIZED!
-  line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+    >>> # INPUT TWEET IS ALREADY NORMALIZED!
+    >>> line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

-  input_ids = torch.tensor([tokenizer.encode(line)])
+    >>> input_ids = torch.tensor([tokenizer.encode(line)])

-  with torch.no_grad():
-      features = bertweet(input_ids)  # Models outputs are now tuples
+    >>> with torch.no_grad():
+    ...     features = bertweet(input_ids)  # Models outputs are now tuples

-  ## With TensorFlow 2.0+:
-  # from transformers import TFAutoModel
-  # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+    >>> # With TensorFlow 2.0+:
+    >>> # from transformers import TFAutoModel
+    >>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")


 The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
--- a/docs/source/model_doc/convbert.rst
+++ b/docs/source/model_doc/convbert.rst
@@ -56,8 +56,7 @@ ConvBertTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.ConvBertTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:


 ConvBertModel
--- a/docs/source/model_doc/cpm.rst
+++ b/docs/source/model_doc/cpm.rst
@@ -0,0 +1,44 @@
+..
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+CPM
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CPM model was proposed in `CPM: A Large-scale Generative Chinese Pre-trained Language Model
+<https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin,
+Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen,
+Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+
+The abstract from the paper is the following:
+
+*Pre-trained Language Models (PLMs) have proven to be beneficial for various downstream NLP tasks. Recently, GPT-3,
+with 175 billion parameters and 570GB training data, drew a lot of attention due to the capacity of few-shot (even
+zero-shot) learning. However, applying GPT-3 to address Chinese NLP tasks is still challenging, as the training corpus
+of GPT-3 is primarily English, and the parameters are not publicly available. In this technical report, we release the
+Chinese Pre-trained Language Model (CPM) with generative pre-training on large-scale Chinese training data. To the best
+of our knowledge, CPM, with 2.6 billion parameters and 100GB Chinese training data, is the largest Chinese pre-trained
+language model, which could facilitate several downstream Chinese NLP tasks, such as conversation, essay generation,
+cloze test, and language understanding. Extensive experiments demonstrate that CPM achieves strong performance on many
+NLP tasks in the settings of few-shot (even zero-shot) learning.*
+
+The original implementation can be found here: https://github.com/TsinghuaAI/CPM-Generate
+
+Note: We only have a tokenizer here, since the model architecture is the same as GPT-2.
+
+CpmTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CpmTokenizer
+    :members:
--- a/docs/source/model_doc/deit.rst
+++ b/docs/source/model_doc/deit.rst
@@ -0,0 +1,109 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+DeiT
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
+    breaking changes to fix them in the future. If you see something strange, file a `GitHub Issue
+    <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The DeiT model was proposed in `Training data-efficient image transformers & distillation through attention
+<https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre
+Sablayrolles, Hervé Jégou. The `Vision Transformer (ViT) <https://huggingface.co/transformers/model_doc/vit.html>`__
+introduced in `Dosovitskiy et al., 2020 <https://arxiv.org/abs/2010.11929>`__ has shown that one can match or even
+outperform existing convolutional neural networks using a Transformer encoder (BERT-like). However, the ViT models
+introduced in that paper required training on expensive infrastructure for multiple weeks, using external data. DeiT
+(data-efficient image transformers) are more efficiently trained transformers for image classification, requiring far
+less data and far less computing resources compared to the original ViT models.
+
+The abstract from the paper is the following:
+
+*Recently, neural networks purely based on attention were shown to address image understanding tasks such as image
+classification. However, these visual transformers are pre-trained with hundreds of millions of images using an
+expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free
+transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision
+transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external
+data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation
+token ensuring that the student learns from the teacher through attention. We show the interest of this token-based
+distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets
+for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and
+models.*
+
+Tips:
+
+- Compared to ViT, DeiT models use a so-called distillation token to effectively learn from a teacher (which, in the
+  DeiT paper, is a ResNet-like model). The distillation token is learned through backpropagation, by interacting with
+  the class ([CLS]) and patch tokens through the self-attention layers.
+- There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top
+  of the final hidden state of the class token and not using the distillation signal, or (2) by placing both a
+  prediction head on top of the class token and on top of the distillation token. In that case, the [CLS] prediction
+  head is trained using regular cross-entropy between the prediction of the head and the ground-truth label, while the
+  distillation prediction head is trained using hard distillation (cross-entropy between the prediction of the
+  distillation head and the label predicted by the teacher). At inference time, one takes the average prediction
+  between both heads as final prediction. (2) is also called "fine-tuning with distillation", because one relies on a
+  teacher that has already been fine-tuned on the downstream dataset. In terms of models, (1) corresponds to
+  :class:`~transformers.DeiTForImageClassification` and (2) corresponds to
+  :class:`~transformers.DeiTForImageClassificationWithTeacher`.
+- Note that the authors also did try soft distillation for (2) (in which case the distillation prediction head is
+  trained using KL divergence to match the softmax output of the teacher), but hard distillation gave the best results.
+- All released checkpoints were pre-trained and fine-tuned on ImageNet-1k only. No external data was used. This is in
+  contrast with the original ViT model, which used external data like the JFT-300M dataset/Imagenet-21k for
+  pre-training.
+- The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into
+  :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. Techniques like data
+  augmentation, optimization, and regularization were used in order to simulate training on a much larger dataset
+  (while only using ImageNet-1k for pre-training). There are 4 variants available (in 3 different sizes):
+  `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`, `facebook/deit-base-patch16-224` and
+  `facebook/deit-base-patch16-384`. Note that one should use :class:`~transformers.DeiTFeatureExtractor` in order to
+  prepare images for the model.
+
+
+DeiTConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTConfig
+    :members:
+
+
+DeiTFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTFeatureExtractor
+    :members: __call__
+
+
+DeiTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTModel
+    :members: forward
+
+
+DeiTForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTForImageClassification
+    :members: forward
+
+
+DeiTForImageClassificationWithTeacher
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTForImageClassificationWithTeacher
+    :members: forward
--- a/docs/source/model_doc/gpt_neo.rst
+++ b/docs/source/model_doc/gpt_neo.rst
@@ -38,9 +38,9 @@ The :obj:`generate()` method can be used to generate text using GPT Neo model.
    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
    ...          "researchers was the fact that the unicorns spoke perfect English."

-    >>> input_ids = tokenizer(unicorns, return_tensors="pt").input_ids
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids

-    >>> gen_tokens = model.generate(ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]


--- a/docs/source/model_doc/herbert.rst
+++ b/docs/source/model_doc/herbert.rst
@@ -40,20 +40,20 @@ Examples of use:

 .. code-block::

-  from transformers import HerbertTokenizer, RobertaModel
+    >>> from transformers import HerbertTokenizer, RobertaModel

-  tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
-  model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
+    >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+    >>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

-  encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
-  outputs = model(encoded_input)
+    >>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+    >>> outputs = model(encoded_input)

-  # HerBERT can also be loaded using AutoTokenizer and AutoModel:
-  import torch
-  from transformers import AutoModel, AutoTokenizer
+    >>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer

-  tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
-  model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
+    >>> tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+    >>> model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")


 The original code can be found `here <https://github.com/allegro/HerBERT>`__.
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -56,24 +56,24 @@ Tips:

 .. code-block::

-   def normalize_bbox(bbox, width, height):
-        return [
-            int(1000 * (bbox[0] / width)),
-            int(1000 * (bbox[1] / height)),
-            int(1000 * (bbox[2] / width)),
-            int(1000 * (bbox[3] / height)),
-        ]
+    def normalize_bbox(bbox, width, height):
+         return [
+             int(1000 * (bbox[0] / width)),
+             int(1000 * (bbox[1] / height)),
+             int(1000 * (bbox[2] / width)),
+             int(1000 * (bbox[3] / height)),
+         ]

 Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
 occurs. Those can be obtained using the Python Image Library (PIL) library for example, as follows:

 .. code-block::

-   from PIL import Image
+    from PIL import Image

-   image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.")

-   width, height = image.size
+    width, height = image.size

 - For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset
  <https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -73,8 +73,7 @@ LEDTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LEDTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:


 LED specific outputs
--- a/docs/source/model_doc/megatron_bert.rst
+++ b/docs/source/model_doc/megatron_bert.rst
@@ -0,0 +1,153 @@
+.. 
+    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MegatronBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
+Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
+Jared Casper and Bryan Catanzaro.
+
+The abstract from the paper is the following:
+
+*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
+Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
+constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
+efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
+approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
+parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
+illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
+15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
+that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
+the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
+billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
+BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
+achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
+accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
+of 89.4%).*
+
+Tips:
+
+We have provided pretrained `BERT-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m>`__ checkpoints
+for use in evaluating or fine-tuning on downstream tasks.
+
+To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and set up the NVIDIA GPU Cloud
+(NGC) Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
+<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
+
+Alternatively, you can directly download the checkpoints using:
+
+BERT-345M-uncased:
+
+.. code-block:: bash
+
+    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip \
+        -O megatron_bert_345m_v0_1_uncased.zip
+
+BERT-345M-cased:
+
+.. code-block:: bash
+
+    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip \
+        -O megatron_bert_345m_v0_1_cased.zip
+
+Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
+easily be loaded by Hugging Face Transformers and our port of the BERT code.
+
+The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains
+``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder:
+
+.. code-block:: bash
+
+    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip 
+
+.. code-block:: bash
+
+    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
+
+The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
+and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
+approach using "tensor parallel" and "pipeline parallel" techniques.
+
+MegatronBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertConfig
+    :members:
+
+
+MegatronBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertModel
+    :members: forward
+
+
+MegatronBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForMaskedLM
+    :members: forward
+
+
+MegatronBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForCausalLM
+    :members: forward
+
+
+MegatronBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForNextSentencePrediction
+    :members: forward
+
+
+MegatronBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForPreTraining
+    :members: forward
+
+
+MegatronBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForSequenceClassification
+    :members: forward
+
+
+MegatronBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForMultipleChoice
+    :members: forward
+
+
+MegatronBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForTokenClassification
+    :members: forward
+
+
+MegatronBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForQuestionAnswering
+    :members: forward
+
+
--- a/docs/source/model_doc/megatron_gpt2.rst
+++ b/docs/source/model_doc/megatron_gpt2.rst
@@ -0,0 +1,70 @@
+.. 
+    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MegatronGPT2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
+Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
+Jared Casper and Bryan Catanzaro.
+
+The abstract from the paper is the following:
+
+*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
+Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
+constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
+efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
+approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
+parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
+illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
+15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
+that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
+the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
+billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
+BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
+achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
+accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
+of 89.4%).*
+
+Tips:
+
+We have provided pretrained `GPT2-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m>`__ checkpoints
+for use in evaluation or for fine-tuning on downstream tasks.
+
+To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and set up the NVIDIA GPU Cloud
+(NGC) Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
+<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
+
+Alternatively, you can directly download the checkpoints using:
+
+.. code-block:: bash
+
+    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip \
+        -O megatron_gpt2_345m_v0_0.zip
+
+Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that can easily
+be loaded by the Hugging Face Transformers GPT2 implementation.
+
+The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains
+``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder:
+
+.. code-block:: bash
+
+    python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
+
+The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
+and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
+approach using "tensor parallel" and "pipeline parallel" techniques.
+
--- a/docs/source/model_doc/phobert.rst
+++ b/docs/source/model_doc/phobert.rst
@@ -31,23 +31,23 @@ Example of use:

 .. code-block::

-  import torch
-  from transformers import AutoModel, AutoTokenizer
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer

-  phobert = AutoModel.from_pretrained("vinai/phobert-base")
-  tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+    >>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

-  # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
-  line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+    >>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+    >>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

-  input_ids = torch.tensor([tokenizer.encode(line)])
+    >>> input_ids = torch.tensor([tokenizer.encode(line)])

-  with torch.no_grad():
-      features = phobert(input_ids)  # Models outputs are now tuples
+    >>> with torch.no_grad():
+    ...     features = phobert(input_ids)  # Models outputs are now tuples

-  ## With TensorFlow 2.0+:
-  # from transformers import TFAutoModel
-  # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
+    >>> # With TensorFlow 2.0+:
+    >>> # from transformers import TFAutoModel
+    >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")


 The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -145,8 +145,8 @@ For training, the :class:`~transformers.ReformerModelWithLMHead` should be used

 .. code-block::

-  input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
-  loss = model(input_ids, labels=input_ids)[0]
+    input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+    loss = model(input_ids, labels=input_ids)[0]


 ReformerConfig
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -73,10 +73,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash

 .. code-block::

-  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-  # the forward function automatically creates the correct decoder_input_ids
-  loss = model(input_ids=input_ids, labels=labels).loss
+    input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+    labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+    # the forward function automatically creates the correct decoder_input_ids
+    loss = model(input_ids=input_ids, labels=labels).loss

 - Supervised training

@@ -86,10 +86,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash

 .. code-block::

-  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-  # the forward function automatically creates the correct decoder_input_ids
-  loss = model(input_ids=input_ids, labels=labels).loss
+    input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+    labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+    # the forward function automatically creates the correct decoder_input_ids
+    loss = model(input_ids=input_ids, labels=labels).loss


 T5Config
--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -1,5 +1,5 @@
 .. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
+    Copyright 2021 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at
@@ -47,10 +47,6 @@ Tips:
  which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
  used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
  vectors to a standard Transformer encoder.
- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
-  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
-  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. The authors report the best results with a resolution of 384x384
-  during fine-tuning.
 - As the Vision Transformer expects each image to be of the same size (resolution), one can use
  :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model.
 - Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
@@ -61,6 +57,10 @@ Tips:
  14 million images and 21k classes) only, or (2) also fine-tuned on `ImageNet
  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
  images and 1,000 classes).
+- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
+  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
+  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. In order to fine-tune at higher resolution, the authors perform
+  2D interpolation of the pre-trained position embeddings, according to their location in the original image.
 - The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
  an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked
  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1,4 +1,4 @@
-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -70,19 +70,19 @@ Run all:

 .. code-block:: console

-   pytest
+    pytest

 or:

 .. code-block:: bash

-   make test
+    make test

 Note that the latter is defined as:

 .. code-block:: bash

-   python -m pytest -n auto --dist=loadfile -s -v ./tests/
+    python -m pytest -n auto --dist=loadfile -s -v ./tests/

 which tells pytest to:

@@ -100,13 +100,13 @@ All tests of the test suite:

 .. code-block:: bash

-   pytest --collect-only -q
+    pytest --collect-only -q

 All tests of a given test file:

 .. code-block:: bash

-   pytest tests/test_optimization.py --collect-only -q
+    pytest tests/test_optimization.py --collect-only -q



@@ -117,7 +117,7 @@ To run an individual test module:

 .. code-block:: bash

-   pytest tests/test_logging.py
+    pytest tests/test_logging.py


 Run specific tests
@@ -128,7 +128,7 @@ class containing those tests. For example, it could be:

 .. code-block:: bash

-   pytest tests/test_optimization.py::OptimizationTest::test_adam_w
+    pytest tests/test_optimization.py::OptimizationTest::test_adam_w

 Here:

@@ -140,7 +140,7 @@ If the file contains multiple classes, you can choose to run only tests of a giv

 .. code-block:: bash

-   pytest tests/test_optimization.py::OptimizationTest
+    pytest tests/test_optimization.py::OptimizationTest


 will run all the tests inside that class.
@@ -149,7 +149,7 @@ As mentioned earlier you can see what tests are contained inside the ``Optimizat

 .. code-block:: bash

-   pytest tests/test_optimization.py::OptimizationTest --collect-only -q
+    pytest tests/test_optimization.py::OptimizationTest --collect-only -q

 You can run tests by keyword expressions.

@@ -157,7 +157,7 @@ To run only tests whose name contains ``adam``:

 .. code-block:: bash

-   pytest -k adam tests/test_optimization.py
+    pytest -k adam tests/test_optimization.py

 Logical ``and`` and ``or`` can be used to indicate whether all keywords should match or either. ``not`` can be used to
 negate.
@@ -166,19 +166,19 @@ To run all tests except those whose name contains ``adam``:

 .. code-block:: bash

-   pytest -k "not adam" tests/test_optimization.py
+    pytest -k "not adam" tests/test_optimization.py

 And you can combine the two patterns in one:

 .. code-block:: bash

-   pytest -k "ada and not adam" tests/test_optimization.py
+    pytest -k "ada and not adam" tests/test_optimization.py

 For example to run both ``test_adafactor`` and ``test_adam_w`` you can use:

 .. code-block:: bash

-   pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py
+    pytest -k "test_adafactor or test_adam_w" tests/test_optimization.py

 Note that we use ``or`` here, since we want either of the keywords to match to include both.

@@ -186,7 +186,7 @@ If you want to include only tests that include both patterns, ``and`` is to be u

 .. code-block:: bash

-   pytest -k "test and ada" tests/test_optimization.py
+    pytest -k "test and ada" tests/test_optimization.py



@@ -251,7 +251,7 @@ example, to run all except ``test_modeling_*.py`` tests:

 .. code-block:: bash

-   pytest `ls -1 tests/*py | grep -v test_modeling`
+    pytest `ls -1 tests/*py | grep -v test_modeling`


 Clearing state
@@ -292,13 +292,13 @@ Repeat tests

 .. code-block:: bash

-   pip install pytest-flakefinder
+    pip install pytest-flakefinder

 And then run every test multiple times (50 by default):

 .. code-block:: bash

-   pytest --flake-finder --flake-runs=5 tests/test_failing_test.py
+    pytest --flake-finder --flake-runs=5 tests/test_failing_test.py

 .. note::
   This plugin doesn't work with ``-n`` flag from ``pytest-xdist``.
@@ -322,19 +322,19 @@ As explained earlier this allows detection of coupled tests - where one test's s

 .. code-block:: bash

-   pytest tests
-   [...]
-   Using --random-order-bucket=module
-   Using --random-order-seed=573663
+    pytest tests
+    [...]
+    Using --random-order-bucket=module
+    Using --random-order-seed=573663

 So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.:

 .. code-block:: bash

-   pytest --random-order-seed=573663
-   [...]
-   Using --random-order-bucket=module
-   Using --random-order-seed=573663
+    pytest --random-order-seed=573663
+    [...]
+    Using --random-order-bucket=module
+    Using --random-order-seed=573663

 It will only reproduce the exact order if you use the exact same list of tests (or no list at all). Once you start to
 manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order
@@ -342,7 +342,7 @@ they failed and tell pytest to not randomize them instead using ``--random-order

 .. code-block:: bash

-   pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py
+    pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py

 To disable the shuffling for all tests:

@@ -369,7 +369,7 @@ progressbar, and show tests that fail and the assert instantly. It gets activate

 .. code-block:: bash

-   pip install pytest-sugar
+    pip install pytest-sugar

 To run tests without it, run:

@@ -388,7 +388,7 @@ For a single or a group of tests via ``pytest`` (after ``pip install pytest-pspe

 .. code-block:: bash

-   pytest --pspec tests/test_optimization.py 
+    pytest --pspec tests/test_optimization.py



@@ -490,8 +490,8 @@ Inside tests:

 .. code-block:: bash

-   from transformers.testing_utils import get_gpu_count
-   n_gpu = get_gpu_count() # works with torch and tf
+    from transformers.testing_utils import get_gpu_count
+    n_gpu = get_gpu_count() # works with torch and tf



@@ -514,8 +514,8 @@ You will need at least 2 GPUs to see these tests in action:

 .. code-block:: bash

-   CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \
-   examples/seq2seq/test_seq2seq_examples_multi_gpu.py
+    CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \
+    examples/seq2seq/test_seq2seq_examples_multi_gpu.py


 Output capture
@@ -528,13 +528,13 @@ To disable output capturing and to get the ``stdout`` and ``stderr`` normally, u

 .. code-block:: bash

-   pytest -s tests/test_logging.py
+    pytest -s tests/test_logging.py

 To send test results to JUnit format output:

 .. code-block:: bash

-   py.test tests --junitxml=result.xml
+    py.test tests --junitxml=result.xml


 Color control
@@ -544,7 +544,7 @@ To have no color (e.g., yellow on white background is not readable):

 .. code-block:: bash

-   pytest --color=no tests/test_logging.py
+    pytest --color=no tests/test_logging.py



@@ -555,7 +555,7 @@ Creating a URL for each test failure:

 .. code-block:: bash

-   pytest --pastebin=failed tests/test_logging.py
+    pytest --pastebin=failed tests/test_logging.py

 This will submit test run information to a remote Paste service and provide a URL for each failure. You may select
 tests as usual or add for example -x if you only want to send one particular failure.
@@ -564,7 +564,7 @@ Creating a URL for a whole test session log:

 .. code-block:: bash

-   pytest --pastebin=all tests/test_logging.py
+    pytest --pastebin=all tests/test_logging.py



@@ -606,13 +606,13 @@ and you could run just the ``negative`` and ``integer`` sets of params with:

 .. code-block:: bash

-   pytest -k "negative and integer" tests/test_mytest.py
+    pytest -k "negative and integer" tests/test_mytest.py

 or all but ``negative`` sub-tests, with:

 .. code-block:: bash

-   pytest -k "not negative" tests/test_mytest.py
+    pytest -k "not negative" tests/test_mytest.py

 Besides using the ``-k`` filter that was just mentioned, you can find out the exact name of each sub-test and run any
 or all of them using their exact names.
@@ -672,7 +672,7 @@ and it will list:

    test_this2.py::test_floor[integer-1-1.0]
    test_this2.py::test_floor[negative--1.5--2.0]
-    test_this2.py::test_floor[large fraction-1.6-1]       
+    test_this2.py::test_floor[large fraction-1.6-1]

 So now you can run just the specific test:

@@ -795,6 +795,23 @@ leave any data in there.
   otherwise.


+Temporary sys.path override
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to temporarily override ``sys.path``, for example to import from another test, you can use the
+``ExtendSysPath`` context manager. Example:
+
+
+.. code-block:: python
+
+    import os
+    from transformers.testing_utils import ExtendSysPath
+    bindir = os.path.abspath(os.path.dirname(__file__))
+    with ExtendSysPath(f"{bindir}/.."):
+        from test_trainer import TrainerIntegrationCommon  # noqa
+
+
+
 Skipping tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/troubleshooting.md
+++ b/docs/source/troubleshooting.md
@@ -0,0 +1,30 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Troubleshooting
+
+This document is to help find solutions for common problems.
+
+## Firewalled environments
+
+Some cloud and intranet setups have their GPU instances firewalled to the outside world, so if your script is trying to download model weights or datasets it will first hang and then time out with an error message like:
+
+```
+ValueError: Connection error, and we cannot find the requested files in the cached path.
+Please try again or make sure your Internet connection is on.
+```
+
+One possible solution in this situation is to use the ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode).
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -43,12 +43,13 @@ from transformers import (
    default_data_collator,
    set_seed,
 )
+from transformers.testing_utils import CaptureLogger
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
 from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -136,8 +137,8 @@ class DataTrainingArguments:
    block_size: Optional[int] = field(
        default=None,
        metadata={
-            "help": "Optional input sequence length after tokenization."
-            "The training dataset will be truncated in block of this size for training."
+            "help": "Optional input sequence length after tokenization. "
+            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
@@ -229,17 +230,19 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -254,7 +257,7 @@ def main():
        )
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -316,8 +319,18 @@ def main():
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

+    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
+    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
+
    def tokenize_function(examples):
-        return tokenizer(examples[text_column_name])
+        with CaptureLogger(tok_logger) as cl:
+            output = tokenizer(examples[text_column_name])
+        # clm input could be much much longer than block_size
+        if "Token indices sequence length is longer than the" in cl.out:
+            tok_logger.warning(
+                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
+            )
+        return output

    tokenized_datasets = datasets.map(
        tokenize_function,
@@ -330,14 +343,14 @@ def main():
    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
-            logger.warn(
+            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
        block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
-            logger.warn(
+            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
--- a/examples/language-modeling/run_clm_no_trainer.py
+++ b/examples/language-modeling/run_clm_no_trainer.py
@@ -305,14 +305,14 @@ def main():
    if args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
-            logger.warn(
+            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
        block_size = 1024
    else:
        if args.block_size > tokenizer.model_max_length:
-            logger.warn(
+            logger.warning(
                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -48,7 +48,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)
 MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
@@ -239,17 +239,19 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -260,7 +262,7 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -324,14 +326,14 @@ def main():
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
-            logger.warn(
+            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
+            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
@@ -422,7 +424,12 @@ def main():

    # Data collator
    # This one will take care of randomly masking the tokens.
-    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
+    pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm_probability=data_args.mlm_probability,
+        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
+    )

    # Initialize our Trainer
    trainer = Trainer(
--- a/examples/language-modeling/run_mlm_flax.py
+++ b/examples/language-modeling/run_mlm_flax.py
@@ -475,17 +475,19 @@ if __name__ == "__main__":
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -496,7 +498,7 @@ if __name__ == "__main__":
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

--- a/examples/language-modeling/run_mlm_no_trainer.py
+++ b/examples/language-modeling/run_mlm_no_trainer.py
@@ -308,14 +308,14 @@ def main():
    if args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
-            logger.warn(
+            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
+            logger.warning(
                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
--- a/examples/language-modeling/run_plm.py
+++ b/examples/language-modeling/run_plm.py
@@ -44,7 +44,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -236,17 +236,19 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -257,7 +259,7 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -319,7 +321,7 @@ def main():
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
+        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@@ -436,7 +436,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
-                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
+                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
--- a/examples/legacy/seq2seq/README.md
+++ b/examples/legacy/seq2seq/README.md
@@ -28,7 +28,7 @@ For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github.
 - `FSMTForConditionalGeneration`
 - `T5ForConditionalGeneration`

-### Downlowd the Datasets
+### Download the Datasets

 #### XSUM

--- a/examples/legacy/seq2seq/run_distributed_eval.py
+++ b/examples/legacy/seq2seq/run_distributed_eval.py
@@ -204,7 +204,8 @@ def run_generate():
            save_json(preds, save_path)
            return
        tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
-        labels = [x.rstrip() for x in open(tgt_file).readlines()][: len(preds)]
+        with open(tgt_file) as f:
+            labels = [x.rstrip() for x in f.readlines()][: len(preds)]

        # Calculate metrics, save metrics,  and save _generations.txt
        calc_bleu = "translation" in args.task
--- a/examples/legacy/seq2seq/seq2seq_trainer.py
+++ b/examples/legacy/seq2seq/seq2seq_trainer.py
@@ -73,7 +73,7 @@ class Seq2SeqTrainer(Trainer):
            ), "Make sure that `config.pad_token_id` is correcly defined when ignoring `pad_token` for loss calculation or doing label smoothing."

        if self.config.pad_token_id is None and self.config.eos_token_id is not None:
-            logger.warn(
+            logger.warning(
                f"The `config.pad_token_id` is `None`. Using `config.eos_token_id` = {self.config.eos_token_id} for padding.."
            )

@@ -127,7 +127,7 @@ class Seq2SeqTrainer(Trainer):
        if self.lr_scheduler is None:
            self.lr_scheduler = self._get_lr_scheduler(num_training_steps)
        else:  # ignoring --lr_scheduler
-            logger.warn("scheduler is passed to `Seq2SeqTrainer`, `--lr_scheduler` arg is ignored.")
+            logger.warning("scheduler is passed to `Seq2SeqTrainer`, `--lr_scheduler` arg is ignored.")

    def _get_lr_scheduler(self, num_training_steps):
        schedule_func = arg_to_scheduler[self.args.lr_scheduler]
--- a/examples/multiple-choice/run_swag.py
+++ b/examples/multiple-choice/run_swag.py
@@ -46,7 +46,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -268,10 +268,10 @@ def main():
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
-        datasets = load_dataset("swag", "regular")
+        datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -310,14 +310,14 @@ def main():
    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
-            logger.warn(
+            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
+            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
--- a/examples/question-answering/run_qa.py
+++ b/examples/question-answering/run_qa.py
@@ -46,7 +46,7 @@ from utils_qa import postprocess_qa_predictions


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -256,7 +256,7 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -269,7 +269,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data")
+        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -324,7 +324,7 @@ def main():
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
+        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
--- a/examples/question-answering/run_qa_beam_search.py
+++ b/examples/question-answering/run_qa_beam_search.py
@@ -45,7 +45,7 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -255,7 +255,7 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -267,7 +267,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data")
+        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -313,7 +313,7 @@ def main():
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
+        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
--- a/examples/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/question-answering/run_qa_beam_search_no_trainer.py
@@ -0,0 +1,797 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning a 🤗 Transformers model on question answering.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import argparse
+import logging
+import math
+import os
+import random
+
+import datasets
+import numpy as np
+import torch
+from datasets import load_dataset, load_metric
+from torch.utils.data.dataloader import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
+from accelerate import Accelerator
+from transformers import (
+    AdamW,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    SchedulerType,
+    XLNetConfig,
+    XLNetForQuestionAnswering,
+    XLNetTokenizerFast,
+    default_data_collator,
+    get_scheduler,
+    set_seed,
+)
+from transformers.utils import check_min_version
+from utils_qa import postprocess_qa_predictions_with_beam_search
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+# Kept in sync with the other example scripts, which all pin the current development version.
+check_min_version("4.6.0.dev0")
+
+
+logger = logging.getLogger(__name__)
+
+
+def parse_args():
+    """
+    Build the command-line interface of this example and parse the process arguments.
+
+    Besides parsing, this function validates the data arguments and creates `--output_dir` when it is given.
+
+    Returns:
+        `argparse.Namespace`: The parsed arguments.
+
+    Raises:
+        ValueError: If none of `--dataset_name`, `--train_file` and `--validation_file` is provided.
+        AssertionError: If `--train_file` or `--validation_file` is not a csv or a json file.
+    """
+
+    def str_to_bool(value):
+        # `type=bool` must not be used with argparse: `bool("False")` is `True` because any non-empty string is
+        # truthy. Parse the text explicitly instead so that `--flag False` really disables the flag.
+        if isinstance(value, bool):
+            return value
+        lowered = value.strip().lower()
+        if lowered in ("true", "t", "yes", "y", "1"):
+            return True
+        if lowered in ("false", "f", "no", "n", "0", ""):
+            return False
+        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a Question Answering task")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help="The name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The configuration name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
+    )
+    parser.add_argument(
+        "--preprocessing_num_workers",
+        type=int,
+        default=4,
+        help="The number of processes to use for the preprocessing.",
+    )
+    parser.add_argument("--do_predict", action="store_true", help="Eval the question answering model")
+    parser.add_argument(
+        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        type=int,
+        default=384,
+        help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
+        " sequences shorter will be padded if `--pad_to_max_length` is passed.",
+    )
+    parser.add_argument(
+        "--pad_to_max_length",
+        action="store_true",
+        help="If passed, pad all samples to `max_seq_length`. Otherwise, dynamic padding is used.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--per_device_train_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--per_device_eval_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the evaluation dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--lr_scheduler_type",
+        type=SchedulerType,
+        default="linear",
+        help="The scheduler type to use.",
+        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
+    )
+    parser.add_argument(
+        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--doc_stride",
+        type=int,
+        default=128,
+        help="When splitting up a long document into chunks how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--n_best_size",
+        type=int,
+        default=20,
+        help="The total number of n-best predictions to generate when looking for an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="The threshold used to select the null answer: if the best answer has a score that is less than "
+        "the score of the null answer minus this threshold, the null answer is selected for this example. "
+        "Only useful when `version_2_with_negative=True`.",
+    )
+    # `nargs="?"` with `const=True` keeps `--version_2_with_negative True` working and also accepts the bare flag.
+    parser.add_argument(
+        "--version_2_with_negative",
+        type=str_to_bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="If true, some of the examples do not have an answer.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        type=int,
+        default=30,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help="For debugging purposes or quicker training, truncate the number of training examples to this "
+        "value if set.",
+    )
+    parser.add_argument(
+        "--max_val_samples",
+        type=int,
+        default=None,
+        help="For debugging purposes or quicker training, truncate the number of validation examples to this "
+        "value if set.",
+    )
+    # Same value handling as `--version_2_with_negative`: explicit values and the bare flag are both accepted.
+    parser.add_argument(
+        "--overwrite_cache",
+        type=str_to_bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="Overwrite the cached training and evaluation sets",
+    )
+    parser.add_argument(
+        "--max_test_samples",
+        type=int,
+        default=None,
+        help="For debugging purposes or quicker training, truncate the number of test examples to this "
+        "value if set.",
+    )
+
+    args = parser.parse_args()
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+    else:
+        # Only the file extension is inspected here; the files themselves are opened later by `load_dataset`.
+        if args.train_file is not None:
+            extension = args.train_file.split(".")[-1]
+            assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+        if args.validation_file is not None:
+            extension = args.validation_file.split(".")[-1]
+            assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+
+    # Create the output directory up front so that saving the model at the end of training cannot fail on it.
+    if args.output_dir is not None:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator = Accelerator()
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state)
+
+    # Setup logging, we only want one process per machine to log things on the screen.
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+    else:
+        data_files = {}
+        if args.train_file is not None:
+            data_files["train"] = args.train_file
+        if args.validation_file is not None:
+            data_files["validation"] = args.validation_file
+        extension = args.train_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = XLNetConfig.from_pretrained(args.model_name_or_path)
+    tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path)
+    model = XLNetForQuestionAnswering.from_pretrained(
+        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
+    )
+
+    # Preprocessing the datasets.
+    # Preprocessing is slightly different for training and evaluation.
+    column_names = raw_datasets["train"].column_names
+
+    question_column_name = "question" if "question" in column_names else column_names[0]
+    context_column_name = "context" if "context" in column_names else column_names[1]
+    answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+    # Padding side determines if we do (question|context) or (context|question).
+    pad_on_right = tokenizer.padding_side == "right"
+
+    if args.max_seq_length > tokenizer.model_max_length:
+        logger.warning(
+            f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
+            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+        )
+
+    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
+
+    # Training preprocessing
+    def prepare_train_features(examples):
+        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            return_special_tokens_mask=True,
+            return_token_type_ids=True,
+            padding="max_length",
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+        # The offset mappings will give us a map from token to character position in the original context. This will
+        # help us compute the start_positions and end_positions.
+        offset_mapping = tokenized_examples.pop("offset_mapping")
+        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
+        special_tokens = tokenized_examples.pop("special_tokens_mask")
+
+        # Let's label those examples!
+        tokenized_examples["start_positions"] = []
+        tokenized_examples["end_positions"] = []
+        tokenized_examples["is_impossible"] = []
+        tokenized_examples["cls_index"] = []
+        tokenized_examples["p_mask"] = []
+
+        for i, offsets in enumerate(offset_mapping):
+            # We will label impossible answers with the index of the CLS token.
+            input_ids = tokenized_examples["input_ids"][i]
+            cls_index = input_ids.index(tokenizer.cls_token_id)
+            tokenized_examples["cls_index"].append(cls_index)
+
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples["token_type_ids"][i]
+            for k, s in enumerate(special_tokens[i]):
+                if s:
+                    sequence_ids[k] = 3
+            context_idx = 1 if pad_on_right else 0
+
+            # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0.
+            # The cls token gets 1.0 too (for predictions of empty answers).
+            tokenized_examples["p_mask"].append(
+                [
+                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
+                    for k, s in enumerate(sequence_ids)
+                ]
+            )
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            answers = examples[answer_column_name][sample_index]
+            # If no answers are given, set the cls_index as answer.
+            if len(answers["answer_start"]) == 0:
+                tokenized_examples["start_positions"].append(cls_index)
+                tokenized_examples["end_positions"].append(cls_index)
+                tokenized_examples["is_impossible"].append(1.0)
+            else:
+                # Start/end character index of the answer in the text.
+                start_char = answers["answer_start"][0]
+                end_char = start_char + len(answers["text"][0])
+
+                # Start token index of the current span in the text.
+                token_start_index = 0
+                while sequence_ids[token_start_index] != context_idx:
+                    token_start_index += 1
+
+                # End token index of the current span in the text.
+                token_end_index = len(input_ids) - 1
+                while sequence_ids[token_end_index] != context_idx:
+                    token_end_index -= 1
+                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
+                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
+                    tokenized_examples["start_positions"].append(cls_index)
+                    tokenized_examples["end_positions"].append(cls_index)
+                    tokenized_examples["is_impossible"].append(1.0)
+                else:
+                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
+                    # Note: we could go after the last offset if the answer is the last word (edge case).
+                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
+                        token_start_index += 1
+                    tokenized_examples["start_positions"].append(token_start_index - 1)
+                    while offsets[token_end_index][1] >= end_char:
+                        token_end_index -= 1
+                    tokenized_examples["end_positions"].append(token_end_index + 1)
+                    tokenized_examples["is_impossible"].append(0.0)
+
+        return tokenized_examples
+
+    if "train" not in raw_datasets:
+        raise ValueError("--do_train requires a train dataset")
+    train_dataset = raw_datasets["train"]
+    if args.max_train_samples is not None:
+        # We will select samples from the whole data if the argument is specified
+        train_dataset = train_dataset.select(range(args.max_train_samples))
+    # Create train feature from dataset
+    train_dataset = train_dataset.map(
+        prepare_train_features,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not args.overwrite_cache,
+    )
+    if args.max_train_samples is not None:
+        # Number of samples might increase during Feature Creation, We select only specified max samples
+        train_dataset = train_dataset.select(range(args.max_train_samples))
+
+    # Validation preprocessing
+    def prepare_validation_features(examples):
+        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            return_special_tokens_mask=True,
+            return_token_type_ids=True,
+            padding="max_length",
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+        # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
+        special_tokens = tokenized_examples.pop("special_tokens_mask")
+
+        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+        # corresponding example_id and we will store the offset mappings.
+        tokenized_examples["example_id"] = []
+
+        # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
+        tokenized_examples["cls_index"] = []
+        tokenized_examples["p_mask"] = []
+
+        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
+            # Find the CLS token in the input ids.
+            cls_index = input_ids.index(tokenizer.cls_token_id)
+            tokenized_examples["cls_index"].append(cls_index)
+
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples["token_type_ids"][i]
+            for k, s in enumerate(special_tokens[i]):
+                if s:
+                    sequence_ids[k] = 3
+            context_idx = 1 if pad_on_right else 0
+
+            # Build the p_mask: non special tokens and context gets 0.0, the others 1.0.
+            tokenized_examples["p_mask"].append(
+                [
+                    0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
+                    for k, s in enumerate(sequence_ids)
+                ]
+            )
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+            # position is part of the context or not.
+            tokenized_examples["offset_mapping"][i] = [
+                (o if sequence_ids[k] == context_idx else None)
+                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+            ]
+
+        return tokenized_examples
+
+    if "validation" not in raw_datasets:
+        raise ValueError("--do_eval requires a validation dataset")
+    eval_examples = raw_datasets["validation"]
+    if args.max_val_samples is not None:
+        # We will select sample from whole data
+        eval_examples = eval_examples.select(range(args.max_val_samples))
+    # Validation Feature Creation
+    eval_dataset = eval_examples.map(
+        prepare_validation_features,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not args.overwrite_cache,
+    )
+
+    if args.max_val_samples is not None:
+        # During Feature creation dataset samples might increase, we will select required samples again
+        eval_dataset = eval_dataset.select(range(args.max_val_samples))
+
+    if args.do_predict:
+        if "test" not in raw_datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        test_examples = raw_datasets["test"]
+        if args.max_test_samples is not None:
+            # We will select sample from whole data
+            test_examples = test_examples.select(range(args.max_test_samples))
+        # Test Feature Creation
+        test_dataset = test_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+        )
+        if args.max_test_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            test_dataset = test_dataset.select(range(args.max_test_samples))
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # DataLoaders creation:
+    if args.pad_to_max_length:
+        # If padding was already done to max length, we use the default data collator that will just convert everything
+        # to tensors.
+        data_collator = default_data_collator
+    else:
+        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
+        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
+        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+
+    train_dataloader = DataLoader(
+        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
+    )
+
+    eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
+    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
+
+    if args.do_predict:
+        test_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
+        test_dataloader = DataLoader(
+            test_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
+        )
+
+    # Post-processing:
+    def post_processing_function(examples, features, predictions, stage="eval"):
+        # Map the raw beam-search outputs back to answer texts, then shape both the predictions and the reference
+        # answers the way the metric expects. `stage` is forwarded to the post-processing helper as `prefix`.
+        answer_texts, null_odds = postprocess_qa_predictions_with_beam_search(
+            examples=examples,
+            features=features,
+            predictions=predictions,
+            version_2_with_negative=args.version_2_with_negative,
+            n_best_size=args.n_best_size,
+            max_answer_length=args.max_answer_length,
+            start_n_top=model.config.start_n_top,
+            end_n_top=model.config.end_n_top,
+            output_dir=args.output_dir,
+            prefix=stage,
+        )
+
+        # One entry per example id; the no-answer probability is only attached for SQuAD v2 style data.
+        formatted_predictions = []
+        for example_id, answer_text in answer_texts.items():
+            entry = {"id": example_id, "prediction_text": answer_text}
+            if args.version_2_with_negative:
+                entry["no_answer_probability"] = null_odds[example_id]
+            formatted_predictions.append(entry)
+
+        # Gold answers, keyed by the same ids.
+        references = []
+        for ex in examples:
+            references.append({"id": ex["id"], "answers": ex[answer_column_name]})
+
+        return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+    metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
+
+    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
+        """
+        Stack per-batch model outputs into a single float32 array of shape (len(dataset), max_len).
+
+        Args:
+            start_or_end_logits(:obj:`list`):
+                The per-batch outputs of the model, each a two-dimensional array (rows, columns). Only one kind of
+                output (e.g. only the start log-probs) should be passed per call.
+            dataset: The dataset the outputs were computed on. Only ``len(dataset)`` is used.
+            max_len(:obj:`int`):
+                The number of columns of the returned array; it must be at least the widest batch.
+
+        Returns:
+            The filled array. Cells that no batch covers keep the fill value -100. Rows beyond ``len(dataset)``
+            (e.g. samples duplicated to complete the last gathered batch) are dropped.
+        """
+
+        num_rows = len(dataset)
+        # create a numpy array and fill it with -100.
+        logits_concat = np.full((num_rows, max_len), -100, dtype=np.float32)
+
+        # `row` is the index of the first row the current batch writes to.
+        row = 0
+        for output_logit in start_or_end_logits:
+            batch_size = output_logit.shape[0]
+            cols = output_logit.shape[1]
+
+            # Keep only the rows that still fit. The clamp to 0 makes a batch that starts past the end of the
+            # dataset a no-op instead of a shape-mismatch error.
+            keep = max(0, min(batch_size, num_rows - row))
+            if keep:
+                logits_concat[row : row + keep, :cols] = output_logit[:keep]
+
+            row += batch_size
+
+        return logits_concat
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
+    # be shorter in multiprocess)
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+        # initialize all lists to collect the batches
+
+    all_start_top_log_probs = []
+    all_start_top_index = []
+    all_end_top_log_probs = []
+    all_end_top_index = []
+    all_cls_logits = []
+    for step, batch in enumerate(eval_dataloader):
+        with torch.no_grad():
+            outputs = model(**batch)
+            start_top_log_probs = outputs.start_top_log_probs
+            start_top_index = outputs.start_top_index
+            end_top_log_probs = outputs.end_top_log_probs
+            end_top_index = outputs.end_top_index
+            cls_logits = outputs.cls_logits
+
+            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
+                start_top_log_probs = accelerator.pad_across_processes(start_top_log_probs, dim=1, pad_index=-100)
+                start_top_index = accelerator.pad_across_processes(start_top_index, dim=1, pad_index=-100)
+                end_top_log_probs = accelerator.pad_across_processes(end_top_log_probs, dim=1, pad_index=-100)
+                end_top_index = accelerator.pad_across_processes(end_top_index, dim=1, pad_index=-100)
+                cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100)
+
+            all_start_top_log_probs.append(accelerator.gather(start_top_log_probs).cpu().numpy())
+            all_start_top_index.append(accelerator.gather(start_top_index).cpu().numpy())
+            all_end_top_log_probs.append(accelerator.gather(end_top_log_probs).cpu().numpy())
+            all_end_top_index.append(accelerator.gather(end_top_index).cpu().numpy())
+            all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())
+
+    max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
+
+    # concatenate all numpy arrays collected above
+    start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, eval_dataset, max_len)
+    start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len)
+    end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, eval_dataset, max_len)
+    end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len)
+    all_cls_logits = np.concatenate(all_cls_logits, axis=0)
+
+    # delete the list of numpy arrays
+    del start_top_log_probs
+    del start_top_index
+    del end_top_log_probs
+    del end_top_index
+
+    eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys()))
+    # Bundle the full-dataset arrays for post-processing. The CLS logits must be the concatenation over all batches
+    # (`all_cls_logits`), not `cls_logits`, which only holds the output tensor of the last evaluated batch.
+    outputs_numpy = (
+        start_top_log_probs_concat,
+        start_top_index_concat,
+        end_top_log_probs_concat,
+        end_top_index_concat,
+        all_cls_logits,
+    )
+    prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
+    eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+    logger.info(f"Evaluation metrics: {eval_metric}")
+
+    if args.do_predict:
+        # initialize all lists to collect the batches
+
+        all_start_top_log_probs = []
+        all_start_top_index = []
+        all_end_top_log_probs = []
+        all_end_top_index = []
+        all_cls_logits = []
+        for step, batch in enumerate(test_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+                start_top_log_probs = outputs.start_top_log_probs
+                start_top_index = outputs.start_top_index
+                end_top_log_probs = outputs.end_top_log_probs
+                end_top_index = outputs.end_top_index
+                cls_logits = outputs.cls_logits
+
+                if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
+                    start_top_log_probs = accelerator.pad_across_processes(start_top_log_probs, dim=1, pad_index=-100)
+                    start_top_index = accelerator.pad_across_processes(start_top_index, dim=1, pad_index=-100)
+                    end_top_log_probs = accelerator.pad_across_processes(end_top_log_probs, dim=1, pad_index=-100)
+                    end_top_index = accelerator.pad_across_processes(end_top_index, dim=1, pad_index=-100)
+                    cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100)
+
+                all_start_top_log_probs.append(accelerator.gather(start_top_log_probs).cpu().numpy())
+                all_start_top_index.append(accelerator.gather(start_top_index).cpu().numpy())
+                all_end_top_log_probs.append(accelerator.gather(end_top_log_probs).cpu().numpy())
+                all_end_top_index.append(accelerator.gather(end_top_index).cpu().numpy())
+                all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy())
+
+        max_len = max([x.shape[1] for x in all_end_top_log_probs])  # Get the max_length of the tensor
+
+        # concatenate all numpy arrays collected above
+        start_top_log_probs_concat = create_and_fill_np_array(all_start_top_log_probs, test_dataset, max_len)
+        start_top_index_concat = create_and_fill_np_array(all_start_top_index, test_dataset, max_len)
+        end_top_log_probs_concat = create_and_fill_np_array(all_end_top_log_probs, test_dataset, max_len)
+        end_top_index_concat = create_and_fill_np_array(all_end_top_index, test_dataset, max_len)
+        all_cls_logits = np.concatenate(all_cls_logits, axis=0)
+
+        # delete the list of numpy arrays
+        del start_top_log_probs
+        del start_top_index
+        del end_top_log_probs
+        del end_top_index
+
+        test_dataset.set_format(type=None, columns=list(test_dataset.features.keys()))
+        # Bundle the full-dataset arrays for post-processing. The CLS logits must be the concatenation over all
+        # batches (`all_cls_logits`), not `cls_logits`, which only holds the output tensor of the last test batch.
+        outputs_numpy = (
+            start_top_log_probs_concat,
+            start_top_index_concat,
+            end_top_log_probs_concat,
+            end_top_index_concat,
+            all_cls_logits,
+        )
+
+        prediction = post_processing_function(test_examples, test_dataset, outputs_numpy)
+        test_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+        logger.info(f"Test metrics: {test_metric}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/question-answering/run_qa_no_trainer.py
+++ b/examples/question-answering/run_qa_no_trainer.py
@@ -0,0 +1,753 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning a 🤗 Transformers model on question answering.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import argparse
+import logging
+import math
+import os
+import random
+
+import datasets
+import numpy as np
+import torch
+from datasets import load_dataset, load_metric
+from torch.utils.data.dataloader import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
+from accelerate import Accelerator
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_MAPPING,
+    AdamW,
+    AutoConfig,
+    AutoModelForQuestionAnswering,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    SchedulerType,
+    default_data_collator,
+    get_scheduler,
+    set_seed,
+)
+from transformers.utils import check_min_version
+from utils_qa import postprocess_qa_predictions
+
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.5.0.dev0")
+
+
+logger = logging.getLogger(__name__)
+# You should update this to your particular problem to have better documentation of `model_type`
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def parse_args():
+    """
+    Parse and validate the command-line arguments of the script.
+
+    Returns:
+        The parsed :obj:`argparse.Namespace`.
+
+    Raises:
+        ValueError: If none of ``--dataset_name``, ``--train_file`` and ``--validation_file`` is given.
+        AssertionError: If ``--train_file`` or ``--validation_file`` is not a csv or json file.
+
+    Side effect: creates ``--output_dir`` when it is given and does not exist yet.
+    """
+
+    def _str_to_bool(value):
+        # `type=bool` would turn every non-empty string (including "False") into True, so parse the text instead.
+        if isinstance(value, bool):
+            return value
+        lowered = value.strip().lower()
+        if lowered in ("true", "t", "yes", "y", "1"):
+            return True
+        if lowered in ("false", "f", "no", "n", "0"):
+            return False
+        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a Question Answering task")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help="The name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The configuration name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
+    )
+    parser.add_argument(
+        "--preprocessing_num_workers",
+        type=int,
+        default=4,
+        help="The number of processes to use for the preprocessing.",
+    )
+    parser.add_argument("--do_predict", action="store_true", help="Eval the question answering model")
+    parser.add_argument(
+        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        type=int,
+        default=384,
+        help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
+        " sequences shorter will be padded if `--pad_to_max_length` is passed.",
+    )
+    parser.add_argument(
+        "--pad_to_max_length",
+        action="store_true",
+        help="If passed, pad all samples to `max_seq_length`. Otherwise, dynamic padding is used.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--config_name",
+        type=str,
+        default=None,
+        help="Pretrained config name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+    )
+    parser.add_argument(
+        "--per_device_train_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--per_device_eval_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the evaluation dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--lr_scheduler_type",
+        type=SchedulerType,
+        default="linear",
+        help="The scheduler type to use.",
+        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
+    )
+    parser.add_argument(
+        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--doc_stride",
+        type=int,
+        default=128,
+        help="When splitting up a long document into chunks how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--n_best_size",
+        type=int,
+        default=20,
+        help="The total number of n-best predictions to generate when looking for an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="The threshold used to select the null answer: if the best answer has a score that is less than "
+        "the score of the null answer minus this threshold, the null answer is selected for this example. "
+        "Only useful when `version_2_with_negative=True`.",
+    )
+    # Accepts both the bare flag (`--version_2_with_negative`) and an explicit value (`... True` / `... False`).
+    parser.add_argument(
+        "--version_2_with_negative",
+        type=_str_to_bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="If true, some of the examples do not have an answer.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        type=int,
+        default=30,
+        help="The maximum length of an answer that can be generated. This is needed because the start "
+        "and end predictions are not conditioned on one another.",
+    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help="For debugging purposes or quicker training, truncate the number of training examples to this "
+        "value if set.",
+    )
+    parser.add_argument(
+        "--max_val_samples",
+        type=int,
+        default=None,
+        help="For debugging purposes or quicker training, truncate the number of validation examples to this "
+        "value if set.",
+    )
+    # Same bare-flag / explicit-value handling as `--version_2_with_negative`.
+    parser.add_argument(
+        "--overwrite_cache",
+        type=_str_to_bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="Overwrite the cached training and evaluation sets",
+    )
+    parser.add_argument(
+        "--max_test_samples",
+        type=int,
+        default=None,
+        help="For debugging purposes or quicker training, truncate the number of test examples to this "
+        "value if set.",
+    )
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default=None,
+        help="Model type to use if training from scratch.",
+        choices=MODEL_TYPES,
+    )
+
+    args = parser.parse_args()
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+    else:
+        if args.train_file is not None:
+            extension = args.train_file.split(".")[-1]
+            assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+        if args.validation_file is not None:
+            extension = args.validation_file.split(".")[-1]
+            assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+
+    # Make sure the directory the final model is saved to exists.
+    if args.output_dir is not None:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator = Accelerator()
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state)
+
+    # Setup logging, we only want one process per machine to log things on the screen.
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+    else:
+        data_files = {}
+        if args.train_file is not None:
+            data_files["train"] = args.train_file
+        if args.validation_file is not None:
+            data_files["validation"] = args.validation_file
+        extension = args.train_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.config_name)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
+    elif args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if args.model_name_or_path:
+        model = AutoModelForQuestionAnswering.from_pretrained(
+            args.model_name_or_path,
+            from_tf=bool(".ckpt" in args.model_name_or_path),
+            config=config,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelForQuestionAnswering.from_config(config)
+
+    # Preprocessing the datasets.
+    # Preprocessing is slightly different for training and evaluation.
+
+    column_names = raw_datasets["train"].column_names
+
+    question_column_name = "question" if "question" in column_names else column_names[0]
+    context_column_name = "context" if "context" in column_names else column_names[1]
+    answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+    # Padding side determines if we do (question|context) or (context|question).
+    pad_on_right = tokenizer.padding_side == "right"
+
+    if args.max_seq_length > tokenizer.model_max_length:
+        # Adjacent f-strings are concatenated as-is, so the first piece has to end with a space.
+        logger.warning(
+            f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
+            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+        )
+
+    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
+
+    # Training preprocessing
+    def prepare_train_features(examples):
+        # Build the training features: tokenize with truncation and maybe padding, but keep the overflows using a
+        # stride. One example can thus give several features when its context is long, each of those features having
+        # a context that overlaps a bit the context of the previous feature. Every feature is then labeled below.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length" if args.pad_to_max_length else False,
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that (it is popped, so it is not part of the output).
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+        # The offset mappings will give us a map from token to character position in the original context. This will
+        # help us compute the start_positions and end_positions (also popped, so not part of the output).
+        offset_mapping = tokenized_examples.pop("offset_mapping")
+
+        # Let's label those examples! Both lists receive exactly one entry per feature.
+        tokenized_examples["start_positions"] = []
+        tokenized_examples["end_positions"] = []
+
+        for i, offsets in enumerate(offset_mapping):
+            # We will label impossible answers with the index of the CLS token.
+            input_ids = tokenized_examples["input_ids"][i]
+            cls_index = input_ids.index(tokenizer.cls_token_id)
+
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples.sequence_ids(i)
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            answers = examples[answer_column_name][sample_index]
+            # If no answers are given, set the cls_index as answer.
+            if len(answers["answer_start"]) == 0:
+                tokenized_examples["start_positions"].append(cls_index)
+                tokenized_examples["end_positions"].append(cls_index)
+            else:
+                # Start/end character index of the first listed answer in the text (any other answer is ignored).
+                start_char = answers["answer_start"][0]
+                end_char = start_char + len(answers["text"][0])
+
+                # First token of this feature that belongs to the context (sequence id 1 if pad_on_right, else 0).
+                token_start_index = 0
+                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
+                    token_start_index += 1
+
+                # Last token of this feature that belongs to the context, found by scanning back from the end.
+                token_end_index = len(input_ids) - 1
+                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
+                    token_end_index -= 1
+
+                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
+                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
+                    tokenized_examples["start_positions"].append(cls_index)
+                    tokenized_examples["end_positions"].append(cls_index)
+                else:
+                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer. Each
+                    # loop overshoots by one token, hence the -1 / +1 corrections when the positions are stored.
+                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
+                        token_start_index += 1
+                    tokenized_examples["start_positions"].append(token_start_index - 1)
+                    while offsets[token_end_index][1] >= end_char:
+                        token_end_index -= 1
+                    tokenized_examples["end_positions"].append(token_end_index + 1)
+
+        return tokenized_examples
+
+    if "train" not in raw_datasets:
+        raise ValueError("--do_train requires a train dataset")
+    train_dataset = raw_datasets["train"]
+    if args.max_train_samples is not None:
+        # Select the first `max_train_samples` examples from the whole dataset if the argument is specified
+        train_dataset = train_dataset.select(range(args.max_train_samples))
+    # Create train feature from dataset
+    train_dataset = train_dataset.map(
+        prepare_train_features,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not args.overwrite_cache,
+    )
+    if args.max_train_samples is not None:
+        # Number of samples might increase during Feature Creation, We select only specified max samples
+        train_dataset = train_dataset.select(range(args.max_train_samples))
+
+    # Validation preprocessing
+    def prepare_validation_features(examples):
+        # Build the evaluation features: tokenize with truncation and maybe padding, but keep the overflows using a
+        # stride. One example can thus give several features when its context is long, each of those features having
+        # a context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length" if args.pad_to_max_length else False,
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that (it is popped, so it is not part of the output).
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+        # corresponding example_id; the offset mappings stay in `tokenized_examples` (they are not popped here).
+        tokenized_examples["example_id"] = []
+
+        for i in range(len(tokenized_examples["input_ids"])):
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples.sequence_ids(i)
+            context_index = 1 if pad_on_right else 0
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+            # Set to None the offset_mapping entries that are not part of the context so it's easy to determine if a
+            # token position is part of the context or not.
+            tokenized_examples["offset_mapping"][i] = [
+                (o if sequence_ids[k] == context_index else None)
+                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+            ]
+
+        return tokenized_examples
+
+    if "validation" not in raw_datasets:
+        raise ValueError("--do_eval requires a validation dataset")
+    eval_examples = raw_datasets["validation"]
+    if args.max_val_samples is not None:
+        # We will select sample from whole data
+        eval_examples = eval_examples.select(range(args.max_val_samples))
+    # Validation Feature Creation
+    eval_dataset = eval_examples.map(
+        prepare_validation_features,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not args.overwrite_cache,
+    )
+
+    if args.max_val_samples is not None:
+        # During Feature creation dataset samples might increase, we will select required samples again
+        eval_dataset = eval_dataset.select(range(args.max_val_samples))
+
+    if args.do_predict:
+        if "test" not in raw_datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        test_examples = raw_datasets["test"]
+        if args.max_test_samples is not None:
+            # We will select sample from whole data
+            test_examples = test_examples.select(range(args.max_test_samples))
+        # Test Feature Creation
+        test_dataset = test_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+        )
+        if args.max_test_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            test_dataset = test_dataset.select(range(args.max_test_samples))
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # DataLoaders creation:
+    if args.pad_to_max_length:
+        # If padding was already done to max length, we use the default data collator that will just convert everything
+        # to tensors.
+        data_collator = default_data_collator
+    else:
+        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
+        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
+        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
+
+    train_dataloader = DataLoader(
+        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
+    )
+
+    eval_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
+    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
+
+    if args.do_predict:
+        test_dataset.set_format(type="torch", columns=["attention_mask", "input_ids", "token_type_ids"])
+        test_dataloader = DataLoader(
+            test_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
+        )
+
+    # Post-processing:
+    def post_processing_function(examples, features, predictions, stage="eval"):
+        # Post-processing: we match the start logits and end logits to answers in the original context (`stage` is
+        # forwarded as the `prefix` argument of `postprocess_qa_predictions`).
+        predictions = postprocess_qa_predictions(
+            examples=examples,
+            features=features,
+            predictions=predictions,
+            version_2_with_negative=args.version_2_with_negative,
+            n_best_size=args.n_best_size,
+            max_answer_length=args.max_answer_length,
+            null_score_diff_threshold=args.null_score_diff_threshold,
+            output_dir=args.output_dir,
+            prefix=stage,
+        )
+        # Format the result the way the metric expects; with version 2 a constant no_answer_probability of 0.0 is added.
+        if args.version_2_with_negative:
+            formatted_predictions = [
+                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+            ]
+        else:
+            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+
+        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+        return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+    metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
+
+    # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
+    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
+        """
+        Create a numpy array of shape (len(dataset), max_len) and fill it with the gathered per-batch logits.
+
+        Args:
+            start_or_end_logits(iterable of 2-D arrays):
+                The output predictions of the model, one array per batch. Pass either the start or the end logits.
+            dataset: The dataset the logits were computed on; only its length is used.
+            max_len(:obj:`int`):
+                The maximum length of the output tensor. ( See the model.eval() part for more details )
+
+        Returns:
+            A float64 array in which positions not covered by any batch keep the value -100.
+        """
+
+        step = 0
+        # create a numpy array and fill it with -100.
+        logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64)
+        for output_logit in start_or_end_logits:  # populate columns
+            # Copy the whole batch at row offset `step`; the last batch is cut so it never runs past len(dataset).
+            batch_size = output_logit.shape[0]
+            cols = output_logit.shape[1]
+
+            if step + batch_size < len(dataset):
+                logits_concat[step : step + batch_size, :cols] = output_logit
+            else:
+                logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
+
+            step += batch_size
+
+        return logits_concat
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
+    # be shorter in multiprocess)
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+    # Validation
+    all_start_logits = []
+    all_end_logits = []
+    for step, batch in enumerate(eval_dataloader):
+        with torch.no_grad():
+            outputs = model(**batch)
+            start_logits = outputs.start_logits
+            end_logits = outputs.end_logits
+
+            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
+                start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100)
+                end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)
+
+            all_start_logits.append(accelerator.gather(start_logits).cpu().numpy())
+            all_end_logits.append(accelerator.gather(end_logits).cpu().numpy())
+
+    max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor
+
+    # concatenate the numpy array
+    start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
+    end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)
+
+    # delete the list of numpy arrays
+    del all_start_logits
+    del all_end_logits
+
+    eval_dataset.set_format(type=None, columns=list(eval_dataset.features.keys()))
+    outputs_numpy = (start_logits_concat, end_logits_concat)
+    prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
+    eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+    logger.info(f"Evaluation metrics: {eval_metric}")
+
+    # Prediction
+    if args.do_predict:
+        all_start_logits = []
+        all_end_logits = []
+        for step, batch in enumerate(test_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+                start_logits = outputs.start_logits
+                end_logits = outputs.end_logits
+
+                if not args.pad_to_max_length:  # pad start and end logits separately so each can be gathered
+                    start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100)
+                    end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)
+
+                all_start_logits.append(accelerator.gather(start_logits).cpu().numpy())
+                all_end_logits.append(accelerator.gather(end_logits).cpu().numpy())
+
+        max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor
+        # concatenate the numpy array
+        start_logits_concat = create_and_fill_np_array(all_start_logits, test_dataset, max_len)
+        end_logits_concat = create_and_fill_np_array(all_end_logits, test_dataset, max_len)
+
+        # delete the list of numpy arrays
+        del all_start_logits
+        del all_end_logits
+
+        # Now we need to add extra columns which we removed for post processing
+        test_dataset.set_format(type=None, columns=list(test_dataset.features.keys()))
+        outputs_numpy = (start_logits_concat, end_logits_concat)
+        prediction = post_processing_function(test_examples, test_dataset, outputs_numpy)
+        eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
+        logger.info(f"Test metrics: {eval_metric}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -181,7 +181,7 @@ def main():
    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
-            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")
+            logger.warning("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")

        try:
            import tensorflow_datasets as tfds
--- a/examples/question-answering/utils_qa.py
+++ b/examples/question-answering/utils_qa.py
@@ -335,9 +335,9 @@ def postprocess_qa_predictions_with_beam_search(
            # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits.
            for i in range(start_n_top):
                for j in range(end_n_top):
-                    start_index = start_indexes[i]
+                    start_index = int(start_indexes[i])
                    j_index = i * end_n_top + j
-                    end_index = end_indexes[j_index]
+                    end_index = int(end_indexes[j_index])
                    # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
                    # p_mask but let's not take any risk)
                    if (
--- a/examples/research_projects/movement-pruning/masked_run_squad.py
+++ b/examples/research_projects/movement-pruning/masked_run_squad.py
@@ -629,7 +629,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
-                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")
+                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
--- a/examples/research_projects/seq2seq-distillation/run_eval.py
+++ b/examples/research_projects/seq2seq-distillation/run_eval.py
@@ -115,7 +115,8 @@ def run_generate(verbose=True):
    parsed_args = parse_numeric_n_bool_cl_kwargs(rest)
    if parsed_args and verbose:
        print(f"parsed the following generate kwargs: {parsed_args}")
-    examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path).readlines()]
+    with open(args.input_path) as f:
+        examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in f.readlines()]
    if args.n_obs > 0:
        examples = examples[: args.n_obs]
    Path(args.save_path).parent.mkdir(exist_ok=True)
--- a/examples/research_projects/wav2vec2/run_common_voice.py
+++ b/examples/research_projects/wav2vec2/run_common_voice.py
@@ -476,13 +476,14 @@ def main():
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
-        train_result = trainer.train(resume_from_checkpoint=checkpoint)
-        trainer.save_model()

-        # save the feature_extractor and the tokenizer
+        # Save the feature_extractor and the tokenizer
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()
+
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
--- a/examples/seq2seq/run_summarization.py
+++ b/examples/seq2seq/run_summarization.py
@@ -46,7 +46,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -310,7 +310,7 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -322,7 +322,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -394,7 +394,7 @@ def main():
    padding = "max_length" if data_args.pad_to_max_length else False

    if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
-        logger.warn(
+        logger.warning(
            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
        )
--- a/examples/seq2seq/run_translation.py
+++ b/examples/seq2seq/run_translation.py
@@ -34,6 +34,9 @@ from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
+    M2M100Tokenizer,
+    MBart50Tokenizer,
+    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
@@ -46,10 +49,13 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

+# A list of all multilingual tokenizers which require src_lang and tgt_lang attributes.
+MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast, M2M100Tokenizer]
+

@dataclass
 class ModelArguments:
@@ -191,6 +197,14 @@ class DataTrainingArguments:
    source_prefix: Optional[str] = field(
        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
    )
+    forced_bos_token: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The token to force as the first generated token after the :obj:`decoder_start_token_id`."
+            "Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token "
+            "needs to be the target language token.(Usually it is the target language token)"
+        },
+    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -280,7 +294,7 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -292,7 +306,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -325,9 +339,6 @@ def main():

    # Set decoder_start_token_id
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert (
-            data_args.target_lang is not None and data_args.source_lang is not None
-        ), "mBart requires --target_lang and --source_lang"
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
        else:
@@ -352,11 +363,21 @@ def main():

    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
-    if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        if data_args.source_lang is not None:
-            tokenizer.src_lang = data_args.source_lang
-        if data_args.target_lang is not None:
-            tokenizer.tgt_lang = data_args.target_lang
+    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
+        assert data_args.target_lang is not None and data_args.source_lang is not None, (
+            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and "
+            "--target_lang arguments."
+        )
+
+        tokenizer.src_lang = data_args.source_lang
+        tokenizer.tgt_lang = data_args.target_lang
+
+        # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
+        # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
+        forced_bos_token_id = (
+            tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None
+        )
+        model.config.forced_bos_token_id = forced_bos_token_id

    # Get the language codes for input/target.
    source_lang = data_args.source_lang.split("_")[0]
@@ -367,7 +388,7 @@ def main():
    padding = "max_length" if data_args.pad_to_max_length else False

    if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
-        logger.warn(
+        logger.warning(
            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
        )
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/examples/tests/deepspeed/test_deepspeed.py
@@ -1,427 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import dataclasses
-import io
-import json
-import os
-import sys
-import unittest
-from copy import deepcopy
-
-from transformers import TrainingArguments
-from transformers.file_utils import WEIGHTS_NAME
-from transformers.integrations import is_deepspeed_available
-from transformers.testing_utils import (
-    CaptureStd,
-    TestCasePlus,
-    execute_subprocess_async,
-    get_gpu_count,
-    mockenv_context,
-    require_torch_gpu,
-    require_torch_multi_gpu,
-    slow,
-)
-from transformers.trainer_utils import set_seed
-
-
-bindir = os.path.abspath(os.path.dirname(__file__))
-sys.path.append(f"{bindir}/../../../tests")
-from test_trainer import TrainerIntegrationCommon, get_regression_trainer  # noqa
-
-
-set_seed(42)
-MBART_TINY = "sshleifer/tiny-mbart"
-
-
-def load_json(path):
-    with open(path) as f:
-        return json.load(f)
-
-
-# a candidate for testing_utils
-def require_deepspeed(test_case):
-    """
-    Decorator marking a test that requires deepspeed
-    """
-    if not is_deepspeed_available():
-        return unittest.skip("test requires deepspeed")(test_case)
-    else:
-        return test_case
-
-
-@require_deepspeed
-@require_torch_gpu
-class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
-    """
-
-    This class is for testing directly via get_regression_trainer
-
-    It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods which we can re-use here.
-    """
-
-    def setUp(self):
-        super().setUp()
-
-        args = TrainingArguments(".")
-        self.n_epochs = args.num_train_epochs
-        self.batch_size = args.train_batch_size
-
-        self.dist_env_1_gpu = dict(
-            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
-        )
-        self.ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
-        with io.open(self.ds_config_file, "r", encoding="utf-8") as f:
-            self.ds_config_dict = json.load(f)
-
-    def test_fake_notebook_no_launcher(self):
-        # this setup emulates a notebook where a launcher needs to be emulated by hand
-        with CaptureStd() as cs:  # noqa
-            with mockenv_context(**self.dist_env_1_gpu):
-                trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
-                trainer.train()
-        # fixme:
-        # assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
-
-    # Test various combos
-    # 1. DS scheduler + DS optimizer: this is already tested by most other tests
-    # 2. HF scheduler + HF optimizer:
-    # 3. DS scheduler + HF optimizer:
-    # 4. HF scheduler + DS optimizer:
-
-    def test_hf_scheduler_hf_optimizer(self):
-        a = 0
-        with mockenv_context(**self.dist_env_1_gpu):
-            ds_config_dict = deepcopy(self.ds_config_dict)
-            del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
-            del ds_config_dict["scheduler"]  # force default HF Trainer scheduler
-            ds_config_dict["zero_optimization"]["cpu_offload"] = False
-            ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_dict)
-            trainer.train()
-        new_a = trainer.model.a.item()
-        self.assertNotEqual(new_a, a)
-
-    def test_ds_scheduler_hf_optimizer(self):
-        a = 0
-        with mockenv_context(**self.dist_env_1_gpu):
-            ds_config_dict = deepcopy(self.ds_config_dict)
-            del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
-            ds_config_dict["zero_optimization"]["cpu_offload"] = False
-            ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_dict)
-            trainer.train()
-        new_a = trainer.model.a.item()
-        self.assertNotEqual(new_a, a)
-
-    def test_hf_scheduler_ds_optimizer(self):
-        # this combo is not possible at the moment
-        with mockenv_context(**self.dist_env_1_gpu):
-            ds_config_dict = deepcopy(self.ds_config_dict)
-            del ds_config_dict["scheduler"]  # force default HF Trainer scheduler
-            ds_config_dict["zero_optimization"]["cpu_offload"] = False
-            ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
-            with self.assertRaises(Exception) as context:
-                trainer.train()
-        self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception))
-
-    def test_hf_optimizer_with_offload(self):
-        # must not allow non-DS optimizer when using ZERO-offload
-        with mockenv_context(**self.dist_env_1_gpu):
-            ds_config_dict = deepcopy(self.ds_config_dict)
-            del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
-            ds_config_dict["zero_optimization"]["cpu_offload"] = True
-            # sanity check - should the default config change
-            assert (
-                "cpu_offload" in ds_config_dict["zero_optimization"]
-                and ds_config_dict["zero_optimization"]["cpu_offload"] is True
-            ), "ensure the config is set up correctly"
-            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
-            with self.assertRaises(Exception) as context:
-                trainer.train()
-        self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception))
-
-    def test_early_get_last_lr(self):
-        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
-        # not run for the first few dozen steps while loss scale is too large, and thus during
-        # that time `get_last_lr` will fail if called during that warm up stage,
-        #
-        # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
-        # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
-        with mockenv_context(**self.dist_env_1_gpu):
-            a = b = 0.0
-            trainer = get_regression_trainer(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=8,
-                deepspeed=self.ds_config_file,
-                per_device_train_batch_size=8,
-                logging_steps=1,
-            )
-            trainer.train()
-            no_grad_accum_a = trainer.model.a.item()
-
-            # it's enough that train didn't fail for this test, but we must check that
-            # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
-            self.assertEqual(no_grad_accum_a, a)
-
-    def test_gradient_accumulation(self):
-
-        # this test measures that we get identical weights and similar loss with:
-        # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1
-        # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2
-        # since the 2nd should produce the effective batch of 1st, with the same results
-        #
-        # I can get an identical loss for a small train_len=32, plus the power of the initial
-        # dynamic loss scale value set to:
-        #   "fp16.initial_scale_power": 1
-        # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file
-        # but for some reason going to train_len=64 the weights, weights start to mismatch with this setup.
-        # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical
-
-        train_len = 64
-        a = b = 0.0
-
-        with mockenv_context(**self.dist_env_1_gpu):
-            no_grad_accum_trainer = get_regression_trainer(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=train_len,
-                deepspeed=self.ds_config_file,
-                per_device_train_batch_size=8,
-                gradient_accumulation_steps=1,
-            )
-            no_grad_accum_result = no_grad_accum_trainer.train()
-            no_grad_accum_loss = no_grad_accum_result.training_loss
-            no_grad_accum_a = no_grad_accum_trainer.model.a.item()
-            no_grad_accum_b = no_grad_accum_trainer.model.b.item()
-            # make sure the optimizer kicked in - if it hasn't changed from the original value of a then make train_len bigger
-            self.assertNotEqual(no_grad_accum_a, a)
-
-        with mockenv_context(**self.dist_env_1_gpu):
-            yes_grad_accum_trainer = get_regression_trainer(
-                a=a,
-                b=b,
-                local_rank=0,
-                train_len=train_len,
-                deepspeed=self.ds_config_file,
-                per_device_train_batch_size=4,
-                gradient_accumulation_steps=2,
-            )
-            yes_grad_accum_result = yes_grad_accum_trainer.train()
-            yes_grad_accum_loss = yes_grad_accum_result.training_loss
-            yes_grad_accum_a = yes_grad_accum_trainer.model.a.item()
-            yes_grad_accum_b = yes_grad_accum_trainer.model.b.item()
-            self.assertNotEqual(yes_grad_accum_a, a)
-
-        # training with half the batch size but accumulation steps as 2 should give the same weights
-        self.assertEqual(no_grad_accum_a, yes_grad_accum_a)
-        self.assertEqual(no_grad_accum_b, yes_grad_accum_b)
-
-        # see the note above how to get identical loss on a small bs
-        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)
-
-    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, is_pretrained=True):
-        # adapted from TrainerIntegrationCommon.check_saved_checkpoints
-
-        file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"]
-        ds_file_list = ["mp_rank_00_model_states.pt", "zero_pp_rank_0_mp_rank_00optim_states.pt"]
-
-        for step in range(freq, total, freq):
-            checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
-            self.assertTrue(os.path.isdir(checkpoint))
-
-            # common files
-            for filename in file_list:
-                self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename)))
-
-            # ds files
-            ds_path = os.path.join(checkpoint, f"global_step{step}")
-            for filename in ds_file_list:
-                # filename = os.path.join(path, filename)
-                # print(filename)
-                self.assertTrue(os.path.isfile(os.path.join(ds_path, filename)))
-
-    def test_save_checkpoints(self):
-        # adapted from  TrainerIntegrationTest.test_save_checkpoints
-
-        output_dir = self.get_auto_remove_tmp_dir()
-        ds_config_dict = deepcopy(self.ds_config_dict)
-        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-        freq = 5
-
-        # save checkpoints
-        with mockenv_context(**self.dist_env_1_gpu):
-            trainer = get_regression_trainer(
-                output_dir=output_dir,
-                save_steps=freq,
-                deepspeed=ds_config_dict,
-            )
-            trainer.train()
-
-        total = int(self.n_epochs * 64 / self.batch_size)
-        self.check_saved_checkpoints_deepspeed(output_dir, freq, total)
-
-    def test_can_resume_training(self):
-        # adapted from TrainerIntegrationTest.test_can_resume_training
-
-        output_dir = self.get_auto_remove_tmp_dir()
-        ds_config_dict = deepcopy(self.ds_config_dict)
-        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-        kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
-
-        with mockenv_context(**self.dist_env_1_gpu):
-            trainer = get_regression_trainer(**kwargs)
-            trainer.train()
-            (a, b) = trainer.model.a.item(), trainer.model.b.item()
-            state = dataclasses.asdict(trainer.state)
-
-            checkpoint = os.path.join(output_dir, "checkpoint-5")
-
-            # Reinitialize trainer
-            trainer = get_regression_trainer(**kwargs)
-
-            trainer.train(resume_from_checkpoint=checkpoint)
-            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(a, a1)
-            self.assertEqual(b, b1)
-            self.check_trainer_state_are_the_same(state, state1)
-
-            # Now check with a later checkpoint that it also works when we span over one epoch
-            checkpoint = os.path.join(output_dir, "checkpoint-15")
-
-            # Reinitialize trainer and load model
-            trainer = get_regression_trainer(**kwargs)
-
-            trainer.train(resume_from_checkpoint=checkpoint)
-            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
-            state1 = dataclasses.asdict(trainer.state)
-            self.assertEqual(a, a1)
-            self.assertEqual(b, b1)
-            self.check_trainer_state_are_the_same(state, state1)
-
-            # Now check failures
-
-            # 1. fail to find a bogus checkpoint
-            trainer = get_regression_trainer(**kwargs)
-            with self.assertRaises(Exception) as context:
-                trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
-            self.assertTrue("failed to resume from checkpoint" in str(context.exception))
-
-            # 2. fail to find any checkpoint - due a fresh output_dir
-            output_dir2 = self.get_auto_remove_tmp_dir()
-            trainer = get_regression_trainer(output_dir=output_dir2, deepspeed=ds_config_dict)
-            with self.assertRaises(Exception) as context:
-                trainer.train(resume_from_checkpoint=True)
-            self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))
-
-
-@slow
-@require_deepspeed
-@require_torch_gpu
-class TestDeepSpeed(TestCasePlus):
-    """ This class is for testing via an external script """
-
-    @require_torch_multi_gpu
-    def test_basic_distributed(self):
-        self.run_quick(distributed=True)
-
-    def test_do_eval_no_train(self):
-        # we should not fail if train is skipped
-        output_dir = self.run_trainer(
-            eval_steps=1,
-            max_len=12,
-            model_name=MBART_TINY,
-            num_train_epochs=1,
-            distributed=False,
-            extra_args_str="--do_eval",
-            remove_args_str="--do_train",
-        )
-        val_metrics = load_json(os.path.join(output_dir, "eval_results.json"))
-        assert "eval_bleu" in val_metrics
-
-    # XXX: need to do better validation beyond just that the run was successful
-    def run_quick(self, distributed=True, extra_args_str=None, remove_args_str=None):
-        output_dir = self.run_trainer(
-            eval_steps=1,
-            max_len=12,
-            model_name=MBART_TINY,
-            num_train_epochs=1,
-            distributed=distributed,
-            extra_args_str=extra_args_str,
-            remove_args_str=remove_args_str,
-        )
-        train_metrics = load_json(os.path.join(output_dir, "train_results.json"))
-        assert "train_runtime" in train_metrics
-
-    def run_trainer(
-        self,
-        eval_steps: int,
-        max_len: str,
-        model_name: str,
-        num_train_epochs: int,
-        distributed: bool = True,
-        extra_args_str: str = None,
-        remove_args_str: str = None,
-    ):
-        data_dir = self.examples_dir / "test_data/wmt_en_ro"
-        output_dir = self.get_auto_remove_tmp_dir()
-        args = f"""
-            --model_name_or_path {model_name}
-            --train_file {data_dir}/train.json
-            --validation_file {data_dir}/val.json
-            --output_dir {output_dir}
-            --overwrite_output_dir
-            --max_train_samples 8
-            --max_val_samples 8
-            --max_source_length {max_len}
-            --max_target_length {max_len}
-            --val_max_target_length {max_len}
-            --do_train
-            --num_train_epochs {str(num_train_epochs)}
-            --per_device_train_batch_size 4
-            --learning_rate 3e-3
-            --warmup_steps 8
-            --predict_with_generate
-            --logging_steps 0
-            --save_steps {str(eval_steps)}
-            --group_by_length
-            --label_smoothing_factor 0.1
-            --adafactor
-            --target_lang ro_RO
-            --source_lang en_XX
-        """.split()
-
-        if extra_args_str is not None:
-            args.extend(extra_args_str.split())
-
-        if remove_args_str is not None:
-            remove_args = remove_args_str.split()
-            args = [x for x in args if x not in remove_args]
-
-        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split()
-        script = [f"{self.examples_dir_str}/seq2seq/run_translation.py"]
-        num_gpus = get_gpu_count() if distributed else 1
-        launcher = f"deepspeed --num_gpus {num_gpus}".split()
-
-        cmd = launcher + script + args + ds_args
-        # keep for quick debug
-        # print(" ".join([f"PYTHONPATH={self.src_dir_str}"] +cmd)); die
-        execute_subprocess_async(cmd, env=self.get_env())
-
-        return output_dir
--- a/examples/text-classification/run_glue.py
+++ b/examples/text-classification/run_glue.py
@@ -45,7 +45,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 task_to_keys = {
    "cola": ("sentence", None),
@@ -239,7 +239,7 @@ def main():
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset("glue", data_args.task_name)
+        datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
@@ -263,10 +263,10 @@ def main():

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
-            datasets = load_dataset("csv", data_files=data_files)
+            datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
-            datasets = load_dataset("json", data_files=data_files)
+            datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -351,7 +351,7 @@ def main():
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
        else:
-            logger.warn(
-                "Your model seems to have been trained with labels, but they don't match the dataset: ",
-                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
-                "\nIgnoring the model labels as a result.",
+            logger.warning(
+                "Your model seems to have been trained with labels, but they don't match the dataset: "
+                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                "\nIgnoring the model labels as a result."
@@ -360,7 +360,7 @@ def main():
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
+        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
--- a/examples/text-classification/run_glue_no_trainer.py
+++ b/examples/text-classification/run_glue_no_trainer.py
@@ -274,7 +274,7 @@ def main():
            )
            label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)}
        else:
-            logger.warn(
-                "Your model seems to have been trained with labels, but they don't match the dataset: ",
-                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
-                "\nIgnoring the model labels as a result.",
+            logger.warning(
+                "Your model seems to have been trained with labels, but they don't match the dataset: "
+                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
+                "\nIgnoring the model labels as a result."
--- a/examples/text-classification/run_xnli.py
+++ b/examples/text-classification/run_xnli.py
@@ -45,7 +45,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -209,17 +209,19 @@ def main():
    # Downloading and loading xnli dataset from the hub.
    if training_args.do_train:
        if model_args.train_language is None:
-            train_dataset = load_dataset("xnli", model_args.language, split="train")
+            train_dataset = load_dataset("xnli", model_args.language, split="train", cache_dir=model_args.cache_dir)
        else:
-            train_dataset = load_dataset("xnli", model_args.train_language, split="train")
+            train_dataset = load_dataset(
+                "xnli", model_args.train_language, split="train", cache_dir=model_args.cache_dir
+            )
        label_list = train_dataset.features["label"].names

    if training_args.do_eval:
-        eval_dataset = load_dataset("xnli", model_args.language, split="validation")
+        eval_dataset = load_dataset("xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir)
        label_list = eval_dataset.features["label"].names

    if training_args.do_predict:
-        test_dataset = load_dataset("xnli", model_args.language, split="test")
+        test_dataset = load_dataset("xnli", model_args.language, split="test", cache_dir=model_args.cache_dir)
        label_list = test_dataset.features["label"].names

    # Labels
@@ -332,13 +334,15 @@ def main():

    # Training
    if training_args.do_train:
+        checkpoint = None
        if last_checkpoint is not None:
-            model_path = last_checkpoint
+            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
-            model_path = model_args.model_name_or_path
-        else:
-            model_path = None
-        train_result = trainer.train(model_path=model_path)
+            # Check the config from that potential checkpoint has the right number of labels before using it as a
+            # checkpoint.
+            if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels:
+                checkpoint = model_args.model_name_or_path
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
--- a/examples/token-classification/README.md
+++ b/examples/token-classification/README.md
@@ -103,7 +103,7 @@ and reply to the questions asked. Then
 accelerate test
 ```

-that will check everything is ready for training. Finally, you cna launch training with
+that will check everything is ready for training. Finally, you can launch training with

 ```bash
 export TASK_NAME=ner
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@@ -45,7 +45,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.5.0")
+check_min_version("4.6.0.dev0")

 logger = logging.getLogger(__name__)

@@ -229,7 +229,7 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -239,7 +239,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files)
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

--- a/scripts/stale.py
+++ b/scripts/stale.py
@@ -26,6 +26,7 @@ LABELS_TO_EXEMPT = [
    "good second issue",
    "feature request",
    "new model",
+    "wip",
 ]


@@ -35,32 +36,29 @@ def main():
    open_issues = repo.get_issues(state="open")

    for issue in open_issues:
+        comments = sorted([comment for comment in issue.get_comments()], key=lambda i: i.created_at, reverse=True)
+        last_comment = comments[0] if len(comments) > 0 else None
        if (
-            not issue.assignees
-            and (dt.utcnow() - issue.updated_at).days > 21
+            last_comment is not None and last_comment.user.login == "github-actions[bot]"
+            and (dt.utcnow() - issue.updated_at).days > 7
            and (dt.utcnow() - issue.created_at).days >= 30
            and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
        ):
-            print("Closing", issue)
-            # issue.create_comment(
-            #     "This issue has been automatically marked as stale and been closed because it has not had "
-            #     "recent activity. Thank you for your contributions.\n\nIf you think this still needs to be addressed"
-            #     " please comment on this thread."
-            # )
-            # issue.add_to_labels("wontfix")
-            # issue.edit(state="closed")
+            # print(f"Would close issue {issue.number} since it has been 7 days of inactivity since bot mention.")
+            issue.edit(state="closed")
        elif (
-            len(issue.assignees) > 0
-            and (dt.utcnow() - issue.updated_at).days > 21
+            (dt.utcnow() - issue.updated_at).days > 23
            and (dt.utcnow() - issue.created_at).days >= 30
+            and not any(label.name.lower() in LABELS_TO_EXEMPT for label in issue.get_labels())
        ):
-            for assignee in issue.assignees:
-                print(f"Issue {issue.number}. Pinging {assignee.name} with message")
-                print(f"Hey @{assignee.login}, could you take a second look at this issue?")
-
-                # issue.create_comment(
-                #    f"Hey @{assignee.login}, could you take a second look at this issue?"
-                # )
+            # print(f"Would add stale comment to {issue.number}")
+            issue.create_comment(
+                "This issue has been automatically marked as stale because it has not had "
+                "recent activity. If you think this still needs to be addressed "
+                "please comment on this thread.\n\nPlease note that issues that do not follow the "
+                "[contributing guidelines](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md) "
+                "are likely to be ignored."
+            )


 if __name__ == "__main__":
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ To create the package for pypi.

 1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
   documentation.
-   
+
 2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.

 3. Unpin specific versions from setup.py that use a git install.
@@ -85,11 +85,14 @@ if stale_egg_info.exists():
 # 1. all dependencies should be listed here with their version requirements if any
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
+    "Pillow",
    "black>=20.8b1",
    "cookiecutter==1.7.2",
    "dataclasses",
    "datasets",
+    "deepspeed>=0.3.14",
    "docutils==0.16.0",
+    "fairscale>0.3",
    "faiss-cpu",
    "fastapi",
    "filelock",
@@ -101,14 +104,15 @@ _deps = [
    "isort>=5.5.4",
    "jax>=0.2.8",
    "jaxlib>=0.1.59",
+    "jieba",
    "keras2onnx",
+    "nltk",
    "numpy>=1.17",
    "onnxconverter-common",
    "onnxruntime-tools>=1.4.2",
    "onnxruntime>=1.4.0",
    "packaging",
    "parameterized",
-    "Pillow",
    "protobuf",
    "psutil",
    "pydantic",
@@ -119,15 +123,18 @@ _deps = [
    "recommonmark",
    "regex!=2019.12.17",
    "requests",
+    "rouge-score",
+    "sacrebleu>=1.4.12",
    "sacremoses",
+    "sagemaker>=2.31.0",
    "scikit-learn",
    "sentencepiece==0.1.91",
    "soundfile",
    "sphinx-copybutton",
    "sphinx-markdown-tables",
    "sphinx-rtd-theme==0.4.3",  # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
-    "sphinxext-opengraph==0.4.1",
    "sphinx==3.2.1",
+    "sphinxext-opengraph==0.4.1",
    "starlette",
    "tensorflow-cpu>=2.3",
    "tensorflow>=2.3",
@@ -139,7 +146,6 @@ _deps = [
    "unidic>=1.0.2",
    "unidic_lite>=1.0.7",
    "uvicorn",
-    "sagemaker>=2.31.0",
 ]


@@ -230,6 +236,8 @@ extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxr
 extras["modelcreation"] = deps_list("cookiecutter")

 extras["sagemaker"] = deps_list("sagemaker")
+extras["deepspeed"] = deps_list("deepspeed")
+extras["fairscale"] = deps_list("fairscale")

 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 extras["speech"] = deps_list("soundfile", "torchaudio")
@@ -238,20 +246,12 @@ extras["vision"] = deps_list("Pillow")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
    deps_list(
-        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black"
+        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black", "sacrebleu", "rouge-score", "nltk"
    )
    + extras["retrieval"]
    + extras["modelcreation"]
 )
-extras["docs"] = deps_list(
-    "docutils",
-    "recommonmark",
-    "sphinx",
-    "sphinx-markdown-tables",
-    "sphinx-rtd-theme",
-    "sphinx-copybutton",
-    "sphinxext-opengraph",
-)
+
 extras["quality"] = deps_list("black", "isort", "flake8")

 extras["all"] = (
@@ -264,12 +264,24 @@ extras["all"] = (
    + extras["vision"]
 )

+extras["docs_specific"] = deps_list(
+    "docutils",
+    "recommonmark",
+    "sphinx",
+    "sphinx-markdown-tables",
+    "sphinx-rtd-theme",
+    "sphinx-copybutton",
+    "sphinxext-opengraph",
+)
+# "docs" needs "all" to resolve all the references
+extras["docs"] = extras["all"] + extras["docs_specific"]
+
 extras["dev"] = (
    extras["all"]
    + extras["testing"]
    + extras["quality"]
    + extras["ja"]
-    + extras["docs"]
+    + extras["docs_specific"]
    + extras["sklearn"]
    + extras["modelcreation"]
 )
@@ -305,7 +317,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.5.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.6.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -22,7 +22,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.5.0"
+__version__ = "4.6.0.dev0"

 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -45,6 +45,7 @@ from .file_utils import (
    _BaseLazyModule,
    is_flax_available,
    is_sentencepiece_available,
+    is_speech_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
@@ -102,6 +103,7 @@ _import_structure = {
        "is_py3nvml_available",
        "is_sentencepiece_available",
        "is_sklearn_available",
+        "is_speech_available",
        "is_tf_available",
        "is_tokenizers_available",
        "is_torch_available",
@@ -133,9 +135,11 @@ _import_structure = {
    "models.auto": [
        "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "CONFIG_MAPPING",
+        "FEATURE_EXTRACTOR_MAPPING",
        "MODEL_NAMES_MAPPING",
        "TOKENIZER_MAPPING",
        "AutoConfig",
+        "AutoFeatureExtractor",
        "AutoTokenizer",
    ],
    "models.bart": ["BartConfig", "BartTokenizer"],
@@ -159,9 +163,11 @@ _import_structure = {
    ],
    "models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"],
    "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
+    "models.cpm": ["CpmTokenizer"],
    "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
    "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
    "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
+    "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"],
    "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"],
    "models.dpr": [
        "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -187,6 +193,7 @@ _import_structure = {
    "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"],
    "models.marian": ["MarianConfig"],
    "models.mbart": ["MBartConfig"],
+    "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"],
    "models.mmbt": ["MMBTConfig"],
    "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"],
    "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"],
@@ -202,7 +209,6 @@ _import_structure = {
    "models.speech_to_text": [
        "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "Speech2TextConfig",
-        "Speech2TextFeatureExtractor",
    ],
    "models.squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig", "SqueezeBertTokenizer"],
    "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
@@ -288,7 +294,6 @@ if is_sentencepiece_available():
    _import_structure["models.pegasus"].append("PegasusTokenizer")
    _import_structure["models.reformer"].append("ReformerTokenizer")
    _import_structure["models.speech_to_text"].append("Speech2TextTokenizer")
-    _import_structure["models.speech_to_text"].append("Speech2TextProcessor")
    _import_structure["models.t5"].append("T5Tokenizer")
    _import_structure["models.xlm_prophetnet"].append("XLMProphetNetTokenizer")
    _import_structure["models.xlm_roberta"].append("XLMRobertaTokenizer")
@@ -337,8 +342,6 @@ if is_tokenizers_available():
    _import_structure["models.xlnet"].append("XLNetTokenizerFast")
    _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"]

-    if is_sentencepiece_available():
-        _import_structure["convert_slow_tokenizer"] = ["SLOW_TO_FAST_CONVERTERS", "convert_slow_tokenizer"]
 else:
    from .utils import dummy_tokenizers_objects

@@ -346,9 +349,39 @@ else:
        name for name in dir(dummy_tokenizers_objects) if not name.startswith("_")
    ]

+if is_sentencepiece_available() and is_tokenizers_available():
+    _import_structure["convert_slow_tokenizer"] = ["SLOW_TO_FAST_CONVERTERS", "convert_slow_tokenizer"]
+else:
+    from .utils import dummy_sentencepiece_and_tokenizers_objects
+
+    _import_structure["utils.dummy_sentencepiece_and_tokenizers_objects"] = [
+        name for name in dir(dummy_sentencepiece_and_tokenizers_objects) if not name.startswith("_")
+    ]
+
+# Speech-specific objects
+if is_speech_available():
+    _import_structure["models.speech_to_text"].append("Speech2TextFeatureExtractor")
+
+else:
+    from .utils import dummy_speech_objects
+
+    _import_structure["utils.dummy_speech_objects"] = [
+        name for name in dir(dummy_speech_objects) if not name.startswith("_")
+    ]
+
+if is_sentencepiece_available() and is_speech_available():
+    _import_structure["models.speech_to_text"].append("Speech2TextProcessor")
+else:
+    from .utils import dummy_sentencepiece_and_speech_objects
+
+    _import_structure["utils.dummy_sentencepiece_and_speech_objects"] = [
+        name for name in dir(dummy_sentencepiece_and_speech_objects) if not name.startswith("_")
+    ]
+
 # Vision-specific objects
 if is_vision_available():
    _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
+    _import_structure["models.deit"].append("DeiTFeatureExtractor")
    _import_structure["models.vit"].append("ViTFeatureExtractor")
 else:
    from .utils import dummy_vision_objects
@@ -425,6 +458,7 @@ if is_torch_available():
            "load_tf_weights_in_albert",
        ]
    )
+
    _import_structure["models.auto"].extend(
        [
            "MODEL_FOR_CAUSAL_LM_MAPPING",
@@ -579,6 +613,15 @@ if is_torch_available():
            "DebertaV2PreTrainedModel",
        ]
    )
+    _import_structure["models.deit"].extend(
+        [
+            "DEIT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "DeiTForImageClassification",
+            "DeiTForImageClassificationWithTeacher",
+            "DeiTModel",
+            "DeiTPreTrainedModel",
+        ]
+    )
    _import_structure["models.distilbert"].extend(
        [
            "DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -736,6 +779,20 @@ if is_torch_available():
            "MBartModel",
        ]
    )
+    _import_structure["models.megatron_bert"].extend(
+        [
+            "MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "MegatronBertForCausalLM",
+            "MegatronBertForMaskedLM",
+            "MegatronBertForMultipleChoice",
+            "MegatronBertForNextSentencePrediction",
+            "MegatronBertForPreTraining",
+            "MegatronBertForQuestionAnswering",
+            "MegatronBertForSequenceClassification",
+            "MegatronBertForTokenClassification",
+            "MegatronBertModel",
+        ]
+    )
    _import_structure["models.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"])
    _import_structure["models.mobilebert"].extend(
        [
@@ -1394,6 +1451,7 @@ if TYPE_CHECKING:
        is_py3nvml_available,
        is_sentencepiece_available,
        is_sklearn_available,
+        is_speech_available,
        is_tf_available,
        is_tokenizers_available,
        is_torch_available,
@@ -1429,9 +1487,11 @@ if TYPE_CHECKING:
    from .models.auto import (
        ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CONFIG_MAPPING,
+        FEATURE_EXTRACTOR_MAPPING,
        MODEL_NAMES_MAPPING,
        TOKENIZER_MAPPING,
        AutoConfig,
+        AutoFeatureExtractor,
        AutoTokenizer,
    )
    from .models.bart import BartConfig, BartTokenizer
@@ -1454,9 +1514,11 @@ if TYPE_CHECKING:
    )
    from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
    from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer
+    from .models.cpm import CpmTokenizer
    from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
    from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
    from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
+    from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig
    from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer
    from .models.dpr import (
        DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -1482,6 +1544,7 @@ if TYPE_CHECKING:
    from .models.m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
    from .models.marian import MarianConfig
    from .models.mbart import MBartConfig
+    from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
    from .models.mmbt import MMBTConfig
    from .models.mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig, MobileBertTokenizer
    from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer
@@ -1494,11 +1557,7 @@ if TYPE_CHECKING:
    from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
    from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer
    from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
-    from .models.speech_to_text import (
-        SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        Speech2TextConfig,
-        Speech2TextFeatureExtractor,
-    )
+    from .models.speech_to_text import SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Speech2TextConfig
    from .models.squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig, SqueezeBertTokenizer
    from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
    from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer
@@ -1585,7 +1644,7 @@ if TYPE_CHECKING:
        from .models.mt5 import MT5Tokenizer
        from .models.pegasus import PegasusTokenizer
        from .models.reformer import ReformerTokenizer
-        from .models.speech_to_text import Speech2TextProcessor, Speech2TextTokenizer
+        from .models.speech_to_text import Speech2TextTokenizer
        from .models.t5 import T5Tokenizer
        from .models.xlm_prophetnet import XLMProphetNetTokenizer
        from .models.xlm_roberta import XLMRobertaTokenizer
@@ -1625,13 +1684,28 @@ if TYPE_CHECKING:
        from .models.xlnet import XLNetTokenizerFast
        from .tokenization_utils_fast import PreTrainedTokenizerFast

-        if is_sentencepiece_available():
-            from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer
    else:
        from .utils.dummy_tokenizers_objects import *

+    if is_sentencepiece_available() and is_tokenizers_available():
+        from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer
+    else:
+        from .utils.dummy_sentencepiece_and_tokenizers_objects import *
+
+    if is_speech_available():
+        from .models.speech_to_text import Speech2TextFeatureExtractor
+
+    else:
+        from .utils.dummy_speech_objects import *
+
+    if is_speech_available() and is_sentencepiece_available():
+        from .models.speech_to_text import Speech2TextProcessor
+    else:
+        from .utils.dummy_sentencepiece_and_speech_objects import *
+
    if is_vision_available():
        from .image_utils import ImageFeatureExtractionMixin
+        from .models.deit import DeiTFeatureExtractor
        from .models.vit import ViTFeatureExtractor
    else:
        from .utils.dummy_vision_objects import *
@@ -1832,6 +1906,13 @@ if TYPE_CHECKING:
            DebertaV2Model,
            DebertaV2PreTrainedModel,
        )
+        from .models.deit import (
+            DEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DeiTForImageClassification,
+            DeiTForImageClassificationWithTeacher,
+            DeiTModel,
+            DeiTPreTrainedModel,
+        )
        from .models.distilbert import (
            DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            DistilBertForMaskedLM,
@@ -1957,6 +2038,18 @@ if TYPE_CHECKING:
            MBartForSequenceClassification,
            MBartModel,
        )
+        from .models.megatron_bert import (
+            MEGATRON_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            MegatronBertForCausalLM,
+            MegatronBertForMaskedLM,
+            MegatronBertForMultipleChoice,
+            MegatronBertForNextSentencePrediction,
+            MegatronBertForPreTraining,
+            MegatronBertForQuestionAnswering,
+            MegatronBertForSequenceClassification,
+            MegatronBertForTokenClassification,
+            MegatronBertModel,
+        )
        from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
        from .models.mobilebert import (
            MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
--- a/src/transformers/commands/run.py
+++ b/src/transformers/commands/run.py
@@ -14,7 +14,7 @@

 from argparse import ArgumentParser

-from ..pipelines import SUPPORTED_TASKS, Pipeline, PipelineDataFormat, pipeline
+from ..pipelines import SUPPORTED_TASKS, TASK_ALIASES, Pipeline, PipelineDataFormat, pipeline
 from ..utils import logging
 from . import BaseTransformersCLICommand

@@ -63,7 +63,9 @@ class RunCommand(BaseTransformersCLICommand):
    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        run_parser = parser.add_parser("run", help="Run a pipeline through the CLI")
-        run_parser.add_argument("--task", choices=SUPPORTED_TASKS.keys(), help="Task to run")
+        run_parser.add_argument(
+            "--task", choices=list(SUPPORTED_TASKS.keys()) + list(TASK_ALIASES.keys()), help="Task to run"
+        )
        run_parser.add_argument("--input", type=str, help="Path to the file to use for inference")
        run_parser.add_argument("--output", type=str, help="Path to the file that will be used post to write results.")
        run_parser.add_argument("--model", type=str, help="Name or path to the model to instantiate.")
--- a/src/transformers/commands/serving.py
+++ b/src/transformers/commands/serving.py
@@ -15,7 +15,7 @@
 from argparse import ArgumentParser, Namespace
 from typing import Any, List, Optional

-from ..pipelines import SUPPORTED_TASKS, Pipeline, pipeline
+from ..pipelines import SUPPORTED_TASKS, TASK_ALIASES, Pipeline, pipeline
 from ..utils import logging
 from . import BaseTransformersCLICommand

@@ -102,7 +102,10 @@ class ServeCommand(BaseTransformersCLICommand):
            "serve", help="CLI tool to run inference requests through REST and GraphQL endpoints."
        )
        serve_parser.add_argument(
-            "--task", type=str, choices=SUPPORTED_TASKS.keys(), help="The task to run the pipeline on"
+            "--task",
+            type=str,
+            choices=list(SUPPORTED_TASKS.keys()) + list(TASK_ALIASES.keys()),
+            help="The task to run the pipeline on",
        )
        serve_parser.add_argument("--host", type=str, default="localhost", help="Interface the server will listen on.")
        serve_parser.add_argument("--port", type=int, default=8888, help="Port the serving will listen to.")
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -262,7 +262,7 @@ class PretrainedConfig(object):

        # TPU arguments
        if kwargs.pop("xla_device", None) is not None:
-            logger.warn(
+            logger.warning(
                "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can "
                "safely remove it from your `config.json` file."
            )
@@ -399,10 +399,11 @@ class PretrainedConfig(object):

        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
-        if config_dict.get("model_type", False) and hasattr(cls, "model_type"):
-            assert (
-                config_dict["model_type"] == cls.model_type
-            ), f"You tried to initiate a model of type '{cls.model_type}' with a pretrained model of type '{config_dict['model_type']}'"
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )

        return cls.from_dict(config_dict, **kwargs)

--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -24,7 +24,7 @@ from typing import Dict, List, Tuple
 from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE, Unigram, WordPiece

-from .file_utils import requires_protobuf, requires_sentencepiece
+from .file_utils import requires_backends


 class SentencePieceExtractor:
@@ -33,7 +33,7 @@ class SentencePieceExtractor:
    """

    def __init__(self, model: str):
-        requires_sentencepiece(self)
+        requires_backends(self, "sentencepiece")
        from sentencepiece import SentencePieceProcessor

        self.sp = SentencePieceProcessor()
@@ -298,14 +298,15 @@ class RobertaConverter(Converter):

 class SpmConverter(Converter):
    def __init__(self, *args):
-        requires_protobuf(self)
+        requires_backends(self, "protobuf")

        super().__init__(*args)

        from .utils import sentencepiece_model_pb2 as model_pb2

        m = model_pb2.ModelProto()
-        m.ParseFromString(open(self.original_tokenizer.vocab_file, "rb").read())
+        with open(self.original_tokenizer.vocab_file, "rb") as f:
+            m.ParseFromString(f.read())
        self.proto = m

    def vocab(self, proto):
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -192,7 +192,7 @@ class DataCollatorForTokenClassification:
        return batch


-def _collate_batch(examples, tokenizer):
+def _collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple)):
@@ -201,7 +201,7 @@ def _collate_batch(examples, tokenizer):
    # Check if padding is necessary.
    length_of_first = examples[0].size(0)
    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
-    if are_tensors_same_length:
+    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
@@ -213,6 +213,8 @@ def _collate_batch(examples, tokenizer):

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)
+    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
@@ -311,6 +313,8 @@ class DataCollatorForLanguageModeling:
            non-masked tokens and the value to predict for the masked token.
        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
            The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the sequence to a multiple of the provided value.

    .. note::

@@ -323,6 +327,7 @@ class DataCollatorForLanguageModeling:
    tokenizer: PreTrainedTokenizerBase
    mlm: bool = True
    mlm_probability: float = 0.15
+    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        if self.mlm and self.tokenizer.mask_token is None:
@@ -336,9 +341,9 @@ class DataCollatorForLanguageModeling:
    ) -> Dict[str, torch.Tensor]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], (dict, BatchEncoding)):
-            batch = self.tokenizer.pad(examples, return_tensors="pt")
+            batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
        else:
-            batch = {"input_ids": _collate_batch(examples, self.tokenizer)}
+            batch = {"input_ids": _collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)}

        # If special token mask has been preprocessed, pop it from the dict.
        special_tokens_mask = batch.pop("special_tokens_mask", None)
--- a/src/transformers/data/datasets/squad.py
+++ b/src/transformers/data/datasets/squad.py
@@ -152,7 +152,7 @@ class SquadDataset(Dataset):
                )

                if self.dataset is None or self.examples is None:
-                    logger.warn(
+                    logger.warning(
                        f"Deleting cached file {cached_features_file} will allow dataset and examples to be cached in future run"
                    )
            else:
--- a/src/transformers/data/metrics/__init__.py
+++ b/src/transformers/data/metrics/__init__.py
@@ -16,7 +16,7 @@

 import warnings

-from ...file_utils import is_sklearn_available, requires_sklearn
+from ...file_utils import is_sklearn_available, requires_backends


 if is_sklearn_available():
@@ -34,13 +34,13 @@ DEPRECATION_WARNING = (

 def simple_accuracy(preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
-    requires_sklearn(simple_accuracy)
+    requires_backends(simple_accuracy, "sklearn")
    return (preds == labels).mean()


 def acc_and_f1(preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
-    requires_sklearn(acc_and_f1)
+    requires_backends(acc_and_f1, "sklearn")
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {
@@ -52,7 +52,7 @@ def acc_and_f1(preds, labels):

 def pearson_and_spearman(preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
-    requires_sklearn(pearson_and_spearman)
+    requires_backends(pearson_and_spearman, "sklearn")
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {
@@ -64,7 +64,7 @@ def pearson_and_spearman(preds, labels):

 def glue_compute_metrics(task_name, preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
-    requires_sklearn(glue_compute_metrics)
+    requires_backends(glue_compute_metrics, "sklearn")
    assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
@@ -94,7 +94,7 @@ def glue_compute_metrics(task_name, preds, labels):

 def xnli_compute_metrics(task_name, preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
-    requires_sklearn(xnli_compute_metrics)
+    requires_backends(xnli_compute_metrics, "sklearn")
    assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
    if task_name == "xnli":
        return {"acc": simple_accuracy(preds, labels)}
--- a/src/transformers/dependency_versions_check.py
+++ b/src/transformers/dependency_versions_check.py
@@ -14,7 +14,7 @@
 import sys

 from .dependency_versions_table import deps
-from .utils.versions import require_version_core
+from .utils.versions import require_version, require_version_core


 # define which module versions we always want to check at run time
@@ -41,3 +41,7 @@ for pkg in pkgs_to_check_at_runtime:
        require_version_core(deps[pkg])
    else:
        raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py")
+
+
+def dep_version_check(pkg, hint=None):
+    require_version(deps[pkg], hint)
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -2,11 +2,14 @@
 # 1. modify the `_deps` dict in setup.py
 # 2. run `make deps_table_update``
 deps = {
+    "Pillow": "Pillow",
    "black": "black>=20.8b1",
    "cookiecutter": "cookiecutter==1.7.2",
    "dataclasses": "dataclasses",
    "datasets": "datasets",
+    "deepspeed": "deepspeed>=0.3.14",
    "docutils": "docutils==0.16.0",
+    "fairscale": "fairscale>0.3",
    "faiss-cpu": "faiss-cpu",
    "fastapi": "fastapi",
    "filelock": "filelock",
@@ -18,14 +21,15 @@ deps = {
    "isort": "isort>=5.5.4",
    "jax": "jax>=0.2.8",
    "jaxlib": "jaxlib>=0.1.59",
+    "jieba": "jieba",
    "keras2onnx": "keras2onnx",
+    "nltk": "nltk",
    "numpy": "numpy>=1.17",
    "onnxconverter-common": "onnxconverter-common",
    "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
    "onnxruntime": "onnxruntime>=1.4.0",
    "packaging": "packaging",
    "parameterized": "parameterized",
-    "Pillow": "Pillow",
    "protobuf": "protobuf",
    "psutil": "psutil",
    "pydantic": "pydantic",
@@ -36,7 +40,10 @@ deps = {
    "recommonmark": "recommonmark",
    "regex": "regex!=2019.12.17",
    "requests": "requests",
+    "rouge-score": "rouge-score",
+    "sacrebleu": "sacrebleu>=1.4.12",
    "sacremoses": "sacremoses",
+    "sagemaker": "sagemaker>=2.31.0",
    "scikit-learn": "scikit-learn",
    "sentencepiece": "sentencepiece==0.1.91",
    "soundfile": "soundfile",
@@ -44,6 +51,7 @@ deps = {
    "sphinx-markdown-tables": "sphinx-markdown-tables",
    "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3",
    "sphinx": "sphinx==3.2.1",
+    "sphinxext-opengraph": "sphinxext-opengraph==0.4.1",
    "starlette": "starlette",
    "tensorflow-cpu": "tensorflow-cpu>=2.3",
    "tensorflow": "tensorflow>=2.3",
@@ -55,5 +63,4 @@ deps = {
    "unidic": "unidic>=1.0.2",
    "unidic_lite": "unidic_lite>=1.0.7",
    "uvicorn": "uvicorn",
-    "sagemaker": "sagemaker>=2.31.0",
 }
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -325,6 +325,13 @@ class FeatureExtractionMixin:
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)

+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+
+        user_agent = {"file_type": "feature extractor", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline
+
        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
            local_files_only = True
@@ -349,6 +356,7 @@ class FeatureExtractionMixin:
                resume_download=resume_download,
                local_files_only=local_files_only,
                use_auth_token=use_auth_token,
+                user_agent=user_agent,
            )
            # Load feature_extractor dict
            with open(resolved_feature_extractor_file, "r", encoding="utf-8") as reader:
@@ -426,6 +434,7 @@ class FeatureExtractionMixin:
            :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
        """
        output = copy.deepcopy(self.__dict__)
+        output["feature_extractor_type"] = self.__class__.__name__

        return output

--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -194,7 +194,7 @@ if (
    and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
    and "TRANSFORMERS_CACHE" not in os.environ
 ):
-    logger.warn(
+    logger.warning(
        "In Transformers v4.0.0, the default path to cache downloaded models changed from "
        "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden "
        "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to "
@@ -397,6 +397,11 @@ def is_torchaudio_available():
    return _torchaudio_available


+def is_speech_available():
+    # For now this depends on torchaudio but the exact dependency might evolve in the future.
+    return _torchaudio_available
+
+
 def torch_only_method(fn):
    def wrapper(*args, **kwargs):
        if not _torch_available:
@@ -513,6 +518,13 @@ explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/ins
 """


+# docstyle-ignore
+SPEECH_IMPORT_ERROR = """
+{0} requires the torchaudio library but it was not found in your environment. You can install it with pip:
+`pip install torchaudio`
+"""
+
+
 # docstyle-ignore
 VISION_IMPORT_ERROR = """
 {0} requires the PIL library but it was not found in your environment. You can install it with pip:
@@ -520,76 +532,32 @@ VISION_IMPORT_ERROR = """
 """


-def requires_datasets(obj):
+BACKENDS_MAPPING = OrderedDict(  # backend name -> (availability check, import-error message template)
+    [
+        ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)),
+        ("faiss", (is_faiss_available, FAISS_IMPORT_ERROR)),
+        ("flax", (is_flax_available, FLAX_IMPORT_ERROR)),
+        ("pandas", (is_pandas_available, PANDAS_IMPORT_ERROR)),
+        ("protobuf", (is_protobuf_available, PROTOBUF_IMPORT_ERROR)),
+        ("scatter", (is_scatter_available, SCATTER_IMPORT_ERROR)),
+        ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
+        ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
+        ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
+        ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
+        ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
+        ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)),
+        ("vision", (is_vision_available, VISION_IMPORT_ERROR)),
+    ]
+)
+
+
+def requires_backends(obj, backends):
+    if not isinstance(backends, (list, tuple)):
+        backends = [backends]  # a single backend name is accepted as well as a list/tuple of names
    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_datasets_available():
-        raise ImportError(DATASETS_IMPORT_ERROR.format(name))
-
-
-def requires_faiss(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_faiss_available():
-        raise ImportError(FAISS_IMPORT_ERROR.format(name))
-
-
-def requires_pytorch(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_torch_available():
-        raise ImportError(PYTORCH_IMPORT_ERROR.format(name))
-
-
-def requires_sklearn(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_sklearn_available():
-        raise ImportError(SKLEARN_IMPORT_ERROR.format(name))
-
-
-def requires_tf(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_tf_available():
-        raise ImportError(TENSORFLOW_IMPORT_ERROR.format(name))
-
-
-def requires_flax(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_flax_available():
-        raise ImportError(FLAX_IMPORT_ERROR.format(name))
-
-
-def requires_tokenizers(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_tokenizers_available():
-        raise ImportError(TOKENIZERS_IMPORT_ERROR.format(name))
-
-
-def requires_sentencepiece(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_sentencepiece_available():
-        raise ImportError(SENTENCEPIECE_IMPORT_ERROR.format(name))
-
-
-def requires_protobuf(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_protobuf_available():
-        raise ImportError(PROTOBUF_IMPORT_ERROR.format(name))
-
-
-def requires_pandas(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_pandas_available():
-        raise ImportError(PANDAS_IMPORT_ERROR.format(name))
-
-
-def requires_scatter(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_scatter_available():
-        raise ImportError(SCATTER_IMPORT_ERROR.format(name))
-
-
-def requires_vision(obj):
-    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__
-    if not is_vision_available():
-        raise ImportError(VISION_IMPORT_ERROR.format(name))
+    failed = [BACKENDS_MAPPING[b][1].format(name) for b in backends if not BACKENDS_MAPPING[b][0]()]
+    if failed:  # report only the backends that are really missing, not every requested one
+        raise ImportError("".join(failed))


 def add_start_docstrings(*docstr):
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -39,8 +39,8 @@ LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""

            `What are input IDs? <../glossary.html#input-ids>`__
        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
-            Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
-            or scores for each vocabulary token after SoftMax.
+            Prediction scores of a language modeling head. These can be logits for each vocabulary token when not
+            using beam search or log softmax for each vocabulary token when using beam search.
        kwargs:
            Additional logits processor specific kwargs.

@@ -77,7 +77,7 @@ class LogitsProcessorList(list):
    This class can be used to create a list of :class:`~transformers.LogitsProcessor` or
    :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from
    list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or
-    :class:`~transformers.LogitsProcessor` to the inputs.
+    :class:`~transformers.LogitsWarper` to the inputs.
    """

    @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -152,17 +152,49 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
            <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
    """

-    def __init__(self, penalty: float):
+    def __init__(self, penalty: float, m=3.33, penalize_last=250):
+        # `penalty` is the base repetition penalty (strictly positive float).
+        # When both `m` and `penalize_last` are set, only the last `penalize_last` tokens of `input_ids` are
+        # penalized, with a factor going from 1.0 (oldest token of that window) to `penalty` (newest token) along a
+        # curve whose steepness is controlled by `m`. When either is None, `penalty` is applied to all tokens.
        if not isinstance(penalty, float) or not (penalty > 0):
            raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")

        self.penalty = penalty
+        self.penalize_last = None
+        if m is not None and penalize_last is not None:
+            if penalize_last < 1:
+                raise ValueError(f"`penalize_last` has to be a strictly positive integer, but is {penalize_last}")
+            if penalize_last == 1:
+                # A one-token window has no ramp: the formula below would evaluate 0 / 0 and fill the penalty with
+                # nan. The newest token always receives the full penalty, so pin its position to +1.
+                position = torch.ones(1)
+            else:
+                # -1.0 for the oldest token of the window, +1.0 for the newest one
+                position = (torch.arange(penalize_last) / (penalize_last - 1)) * 2.0 - 1
+            curve = (m * position) / (1 + torch.abs(position) * (m - 1))
+            # rescale from [-1, 1] to the range between 1 and `penalty`; the leading dim of size 1 broadcasts
+            self.penalty = 1 + ((curve + 1) / 2).unsqueeze(0) * (penalty - 1)
+            self.penalize_last = penalize_last

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        if self.penalize_last is not None:
+            # restrict the penalty to the most recent `penalize_last` tokens (fewer if the sequence is shorter)
+            window = min(input_ids.shape[1], self.penalize_last)
+            input_ids = input_ids[:, input_ids.shape[1] - window :]
        score = torch.gather(scores, 1, input_ids)

        # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
-        score = torch.where(score < 0, score * self.penalty, score / self.penalty)
+        if self.penalize_last is not None:
+            # Keep the tail of the curve so the newest token still gets the full penalty. Slicing from a
+            # non-negative start (instead of `-window`) stays correct when `window` is 0.
+            penalty = self.penalty[:, self.penalty.shape[1] - window :].to(device=score.device, dtype=score.dtype)
+            score = torch.where(score < 0, score * penalty, score / penalty)
+        else:
+            score = torch.where(score < 0, score * self.penalty, score / self.penalty)

+        # NOTE(review): with the position-dependent penalty, a token id that occurs several times in the window
+        # yields several different values for the same index; which one `scatter_` keeps is not specified.
+        # Confirm whether the strongest (most recent) penalty is the one that should win.
        scores.scatter_(1, input_ids, score)
        return scores
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -18,6 +18,7 @@ from dataclasses import dataclass
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

 import torch
+import torch.distributed as dist
 from torch.nn import functional as F

 from .file_utils import ModelOutput
@@ -569,6 +570,8 @@ class GenerationMixin:
    def _get_logits_processor(
        self,
        repetition_penalty: float,
+        repetition_penalty_range: int,
+        repetition_penalty_slope: float,
        no_repeat_ngram_size: int,
        encoder_no_repeat_ngram_size: int,
        encoder_input_ids: torch.LongTensor,
@@ -624,7 +627,7 @@ class GenerationMixin:
                )
            )
        if repetition_penalty is not None and repetition_penalty != 1.0:
-            processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
+            processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty, m=repetition_penalty_slope, penalize_last=repetition_penalty_range))
        if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0:
            processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size))
        if encoder_no_repeat_ngram_size is not None and encoder_no_repeat_ngram_size > 0:
@@ -674,6 +677,8 @@ class GenerationMixin:
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
+        repetition_penalty_range: Optional[int] = None,
+        repetition_penalty_slope: Optional[float] = 3.33,
        bad_words_ids: Optional[Iterable[int]] = None,
        bos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
@@ -695,6 +700,7 @@ class GenerationMixin:
        forced_bos_token_id: Optional[int] = None,
        forced_eos_token_id: Optional[int] = None,
        remove_invalid_values: Optional[bool] = None,
+        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, torch.LongTensor]:
        r"""
@@ -800,6 +806,8 @@ class GenerationMixin:
            remove_invalid_values (:obj:`bool`, `optional`):
                Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to
                crash. Note that using ``remove_invalid_values`` can slow down generation.
+            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)

            model_kwargs:
                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the
@@ -963,6 +971,8 @@ class GenerationMixin:
        # get distribution pre_processing samplers
        logits_processor = self._get_logits_processor(
            repetition_penalty=repetition_penalty,
+            repetition_penalty_range=repetition_penalty_range,
+            repetition_penalty_slope=repetition_penalty_slope,
            no_repeat_ngram_size=no_repeat_ngram_size,
            encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
            encoder_input_ids=encoder_input_ids,
@@ -1000,6 +1010,7 @@ class GenerationMixin:
                eos_token_id=eos_token_id,
                output_scores=output_scores,
                return_dict_in_generate=return_dict_in_generate,
+                synced_gpus=synced_gpus,
                **model_kwargs,
            )

@@ -1028,6 +1039,7 @@ class GenerationMixin:
                eos_token_id=eos_token_id,
                output_scores=output_scores,
                return_dict_in_generate=return_dict_in_generate,
+                synced_gpus=synced_gpus,
                **model_kwargs,
            )

@@ -1063,6 +1075,7 @@ class GenerationMixin:
                eos_token_id=eos_token_id,
                output_scores=output_scores,
                return_dict_in_generate=return_dict_in_generate,
+                synced_gpus=synced_gpus,
                **model_kwargs,
            )

@@ -1102,6 +1115,7 @@ class GenerationMixin:
                eos_token_id=eos_token_id,
                output_scores=output_scores,
                return_dict_in_generate=return_dict_in_generate,
+                synced_gpus=synced_gpus,
                **model_kwargs,
            )

@@ -1141,6 +1155,7 @@ class GenerationMixin:
                eos_token_id=eos_token_id,
                output_scores=output_scores,
                return_dict_in_generate=return_dict_in_generate,
+                synced_gpus=synced_gpus,
                **model_kwargs,
            )

@@ -1156,13 +1171,12 @@ class GenerationMixin:
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[GreedySearchOutput, torch.LongTensor]:
        r"""
        Generates sequences for models with a language modeling head using greedy decoding.

-
-
        Parameters:

            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1175,6 +1189,7 @@ class GenerationMixin:
            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
                An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
                :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
+
            max_length (:obj:`int`, `optional`, defaults to 20):
                The maximum length of the sequence to be generated.
            pad_token_id (:obj:`int`, `optional`):
@@ -1191,6 +1206,8 @@ class GenerationMixin:
                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the
                model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
@@ -1265,7 +1282,19 @@ class GenerationMixin:
            input_ids, max_length
        )

+        this_peer_finished = False  # used by synced_gpus only
        while cur_len < max_length:
+
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

@@ -1276,6 +1305,11 @@ class GenerationMixin:
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
+
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue  # don't waste resources running the code we don't need
+
            next_token_logits = outputs.logits[:, -1, :]

            # Store scores, attentions and hidden_states when required
@@ -1321,16 +1355,16 @@ class GenerationMixin:
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )

-            # stop when there is a </s> in each sentence, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0:
-                break
-
-            if stopping_criteria(input_ids, scores):
-                break
-
            # increase cur_len
            cur_len = cur_len + 1

+            # stop when there is a </s> in each sentence, or if we exceed the maximum length
+            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                if not synced_gpus:
+                    break
+                else:
+                    this_peer_finished = True
+
        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                return GreedySearchEncoderDecoderOutput(
@@ -1365,6 +1399,7 @@ class GenerationMixin:
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[SampleOutput, torch.LongTensor]:
        r"""
@@ -1402,6 +1437,8 @@ class GenerationMixin:
                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
@@ -1485,8 +1522,20 @@ class GenerationMixin:
            input_ids, max_length
        )

+        this_peer_finished = False  # used by synced_gpus only
        # auto-regressive generation
        while cur_len < max_length:
+
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
            # prepare model inputs
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

@@ -1497,6 +1546,11 @@ class GenerationMixin:
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
+
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue  # don't waste resources running the code we don't need
+
            next_token_logits = outputs.logits[:, -1, :]

            # pre-process distribution
@@ -1533,7 +1587,6 @@ class GenerationMixin:

            # add token and increase length by one
            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            cur_len = cur_len + 1

            # update sequence length
            if eos_token_id is not None:
@@ -1541,18 +1594,21 @@ class GenerationMixin:
                    sequence_lengths, unfinished_sequences, cur_len, next_tokens == eos_token_id
                )

-            # stop when there is a </s> in each sentence, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0:
-                break
-
-            if stopping_criteria(input_ids, scores):
-                break
-
            # update model kwargs
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )

+            # increase cur_len
+            cur_len = cur_len + 1
+
+            # stop when there is a </s> in each sentence, or if we exceed the maximum length
+            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
+                if not synced_gpus:
+                    break
+                else:
+                    this_peer_finished = True
+
        if return_dict_in_generate:
            if self.config.is_encoder_decoder:
                return SampleEncoderDecoderOutput(
@@ -1587,6 +1643,7 @@ class GenerationMixin:
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[BeamSearchOutput, torch.LongTensor]:
        r"""
@@ -1624,6 +1681,8 @@ class GenerationMixin:
                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
@@ -1726,7 +1785,19 @@ class GenerationMixin:
        beam_scores[:, 1:] = -1e9
        beam_scores = beam_scores.view((batch_size * num_beams,))

+        this_peer_finished = False  # used by synced_gpus only
        while cur_len < max_length:
+
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            outputs = self(
@@ -1735,6 +1806,11 @@ class GenerationMixin:
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
+
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue  # don't waste resources running the code we don't need
+
            next_token_logits = outputs.logits[:, -1, :]

            # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
@@ -1792,19 +1868,20 @@ class GenerationMixin:

            input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)

-            cur_len = cur_len + 1
-
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )
            if model_kwargs["past"] is not None:
                model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx)

-            if beam_scorer.is_done:
-                break
+            # increase cur_len
+            cur_len = cur_len + 1

-            if stopping_criteria(input_ids, scores):
-                break
+            if beam_scorer.is_done or stopping_criteria(input_ids, scores):
+                if not synced_gpus:
+                    break
+                else:
+                    this_peer_finished = True

        sequence_outputs = beam_scorer.finalize(
            input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id
@@ -1849,6 +1926,7 @@ class GenerationMixin:
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ) -> Union[BeamSampleOutput, torch.LongTensor]:
        r"""
@@ -1890,6 +1968,8 @@ class GenerationMixin:
                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
@@ -1993,7 +2073,19 @@ class GenerationMixin:
        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
        beam_scores = beam_scores.view((batch_size * num_beams,))

+        this_peer_finished = False  # used by synced_gpus only
        while cur_len < max_length:
+
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

            outputs = self(
@@ -2002,6 +2094,11 @@ class GenerationMixin:
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
            )
+
+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue  # don't waste resources running the code we don't need
+
            next_token_logits = outputs.logits[:, -1, :]

            # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
@@ -2063,7 +2160,6 @@ class GenerationMixin:
            beam_idx = beam_outputs["next_beam_indices"]

            input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
-            cur_len = cur_len + 1

            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
@@ -2071,11 +2167,14 @@ class GenerationMixin:
            if model_kwargs["past"] is not None:
                model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx)

-            if beam_scorer.is_done:
-                break
+            # increase cur_len
+            cur_len = cur_len + 1

-            if stopping_criteria(input_ids, scores):
-                break
+            if beam_scorer.is_done or stopping_criteria(input_ids, scores):
+                if not synced_gpus:
+                    break
+                else:
+                    this_peer_finished = True

        sequence_outputs = beam_scorer.finalize(
            input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id
@@ -2119,6 +2218,7 @@ class GenerationMixin:
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: Optional[bool] = None,
        **model_kwargs,
    ):
        r"""
@@ -2156,6 +2256,9 @@ class GenerationMixin:
                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+
            model_kwargs:
                Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If
                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
@@ -2266,7 +2369,19 @@ class GenerationMixin:
        beam_scores[:, ::num_sub_beams] = 0
        beam_scores = beam_scores.view((batch_size * num_beams,))

+        this_peer_finished = False  # used by synced_gpus only
        while cur_len < max_length:
+
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
            # predicted tokens in cur_len step
            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)

@@ -2282,6 +2397,10 @@ class GenerationMixin:
                output_hidden_states=output_hidden_states,
            )

+            if synced_gpus and this_peer_finished:
+                cur_len = cur_len + 1
+                continue  # don't waste resources running the code we don't need
+
            for beam_group_idx in range(num_beam_groups):
                group_start_idx = beam_group_idx * num_sub_beams
                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
@@ -2372,19 +2491,22 @@ class GenerationMixin:
                        else (outputs.hidden_states,)
                    )

+            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)
+
            model_kwargs = self._update_model_kwargs_for_generation(
                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
            )
            if model_kwargs["past"] is not None:
                model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], reordering_indices)

-            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)
+            # increase cur_len
            cur_len = cur_len + 1
-            if beam_scorer.is_done:
-                break

-            if stopping_criteria(input_ids, scores):
-                break
+            if beam_scorer.is_done or stopping_criteria(input_ids, scores):
+                if not synced_gpus:
+                    break
+                else:
+                    this_peer_finished = True

        sequence_outputs = beam_scorer.finalize(
            input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -19,6 +19,10 @@ import PIL.Image
 from .file_utils import _is_torch, is_torch_available


+IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]
+
+
 def is_torch_tensor(obj):
    return _is_torch(obj) if is_torch_available() else False

--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -19,12 +19,13 @@ import io
 import json
 import numbers
 import os
+import sys
 import tempfile
 from copy import deepcopy
 from pathlib import Path

+from .dependency_versions_check import dep_version_check
 from .utils import logging
-from .utils.versions import require_version


 logger = logging.get_logger(__name__)
@@ -54,7 +55,7 @@ from .trainer_utils import PREFIX_CHECKPOINT_DIR, BestRun, IntervalStrategy  # n
 def is_wandb_available():
    # any value of WANDB_DISABLED disables wandb
    if os.getenv("WANDB_DISABLED", "").upper() in ENV_VARS_TRUE_VALUES:
-        logger.warn(
+        logger.warning(
            "Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the "
            "--report_to flag to control the integrations used for logging result (for instance --report_to none)."
        )
@@ -268,7 +269,77 @@ def rewrite_logs(d):
    return new_d


-def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
+_is_deepspeed_zero3_enabled = None
+
+
+def is_deepspeed_zero3_enabled():
+    """
+    This function answers the question of whether DeepSpeed is going to be used and run using ZeRO Stage 3.
+
+    It includes an auto-discovery method, see comments in the code for details.
+
+    Returns: ``True`` if either it was explicitly enabled via ``deepspeed_zero3_enable(True)`` or the auto-detector was
+    able to derive that the ``Trainer`` will be running via DeepSpeed ZeRO stage 3.
+    """
+    global _is_deepspeed_zero3_enabled
+    if _is_deepspeed_zero3_enabled is None:
+        _is_deepspeed_zero3_enabled = False
+        # Try to auto-discover if we are about to use DeepSpeed with ZeRO3 enabled. This will only
+        # work for scripts using cli to pass --deepspeed ds_config.json. If cmd args aren't used,
+        # then to get the model efficiently loaded across multiple-gpus one has to explicitly call
+        # deepspeed_zero3_enable(True) **before** instantiating a model object
+        if "--deepspeed" in sys.argv:
+            idx = sys.argv.index("--deepspeed")
+            ds_config = sys.argv[idx + 1]
+            if not os.path.exists(ds_config):
+                raise ValueError("--deepspeed requires a valid path to a config file")
+            config = deepspeed_parse_config(ds_config)
+            if (
+                "zero_optimization" in config
+                and "stage" in config["zero_optimization"]
+                and config["zero_optimization"]["stage"] == 3
+            ):
+                _is_deepspeed_zero3_enabled = True
+
+    return _is_deepspeed_zero3_enabled
+
+
+def deepspeed_zero3_enable(enable=True):
+    """
+    ``is_deepspeed_zero3_enabled()`` tries to derive automatically if DeepSpeed ZeRO 3 is going to be used by looking
+    at ``sys.argv`` which may or may not contain information about where to find the DeepSpeed config if any.
+
+    This function allows for explicit enabling/disabling of this global flag.
+
+    Args:
+        enable: if set to ``True`` will make ``is_deepspeed_zero3_enabled()`` return ``True``
+    """
+    global _is_deepspeed_zero3_enabled
+    _is_deepspeed_zero3_enabled = enable
+
+
+def deepspeed_parse_config(ds_config):
+    """
+    If ``ds_config`` isn't already a dict, read it from the config file.
+
+    If it's already a dict, return a copy of it, so that we can freely modify it.
+    """
+    dep_version_check("deepspeed")
+
+    if isinstance(ds_config, dict):
+        # Don't modify user's data should they want to reuse it (e.g. in tests), because once we
+        # modified it, it will not be accepted here again, since some config params must not be set by users
+        config = deepcopy(ds_config)
+    elif isinstance(ds_config, str):
+        with io.open(ds_config, "r", encoding="utf-8") as f:
+            config = json.load(f)
+    else:
+        raise ValueError("expecting either a path to a config file or a pre-populated dict")
+
+    return config
+
+
+def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
    """
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.

@@ -284,21 +355,10 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
    """
    import deepspeed

-    require_version("deepspeed>0.3.12")
-
    args = trainer.args
-    ds_config_file = args.deepspeed
    model = trainer.model

-    if isinstance(args.deepspeed, dict):
-        # Don't modify user's data should they want to reuse it (e.g. in tests), because once we
-        # modified it, it will not be accepted here again, since some config params must be not set by users
-        config = deepcopy(args.deepspeed)
-    elif isinstance(args.deepspeed, str):
-        with io.open(ds_config_file, "r", encoding="utf-8") as f:
-            config = json.load(f)
-    else:
-        raise ValueError("expecting either a path to a config file or a pre-populated dict")
+    config = deepspeed_parse_config(args.deepspeed)

    # The following code translates relevant trainer's cl args into the DS config

@@ -324,9 +384,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
    config["gradient_accumulation_steps"] = args.gradient_accumulation_steps

    if "gradient_clipping" in config:
-        logger.info(
-            f"Keeping the `gradient_clipping` config from {ds_config_file} intact, ignoring any gradient clipping-specific cl args"
-        )
+        logger.info("Keeping the `gradient_clipping` config intact, ignoring any gradient clipping-specific cl args")
    else:  # override only if the ds config doesn't already have this section
        config["gradient_clipping"] = args.max_grad_norm

@@ -336,6 +394,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
    # 2. HF scheduler + HF optimizer: Yes
    # 3. DS scheduler + HF optimizer: Yes
    # 4. HF scheduler + DS optimizer: No
+    #
    # Unless Offload is enabled in which case it's:
    # 1. DS scheduler + DS optimizer: Yes
    # 2. HF scheduler + HF optimizer: No
@@ -344,7 +403,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):

    optimizer = None
    if "optimizer" in config:
-        logger.info(f"Updating the `scheduler` config from {ds_config_file} with other command line arguments")
+        logger.info("Updating the `scheduler` config with other command line arguments")

        # to avoid inconsistent values of lr and warm up steps the command line args override config
        params = dict(
@@ -384,7 +443,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
    # WarmupDecayLR| linear               | get_linear_schedule_with_warmup   |
    lr_scheduler = None
    if "scheduler" in config:
-        logger.info(f"Updating the `scheduler` config from {ds_config_file} with other command line arguments")
+        logger.info("Updating the `scheduler` config with other command line arguments")
        # the user won't easily know the correct num_training_steps should they use WarmupDecayLR,
        # so let's set it to the correct value
        if config["scheduler"]["type"] == "WarmupDecayLR":
@@ -417,9 +476,7 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
        # - `amp`: which delegates amp work to apex (which needs to be available), but it cannot be used with any ZeRO features, so probably best to be avoided.
        if trainer.fp16_backend == "apex":
            if "amp" in config:
-                logger.info(
-                    f"Keeping the `amp` config from {ds_config_file} intact, ignoring any amp-specific cl args"
-                )
+                logger.info("Keeping the `amp` config intact, ignoring any amp-specific cl args")
            else:
                config["amp"] = {
                    "enabled": True,
@@ -427,19 +484,33 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
                }
        elif trainer.fp16_backend == "amp":
            if "fp16" in config:
-                logger.info(
-                    f"Keeping the `fp16` config from {ds_config_file} intact, ignoring any fp16-specific cl args"
-                )
+                logger.info("Keeping the `fp16` config intact, ignoring any fp16-specific cl args")
            else:
                config["fp16"] = {
                    "enabled": True,
                }

+    # zero
+    if "zero_optimization" in config:
+        zero = config["zero_optimization"]
+
+        # now we know for sure if zero3 is enabled
+        deepspeed_zero3_enable(zero.get("stage") == 3)
+
+        # automatically assign the optimal config values based on model config
+        hidden_size = model.config.hidden_size
+        if zero.get("reduce_bucket_size") == 0:
+            zero["reduce_bucket_size"] = hidden_size * hidden_size
+        if zero.get("stage3_prefetch_bucket_size") == 0:
+            zero["stage3_prefetch_bucket_size"] = 0.9 * hidden_size * hidden_size
+        if zero.get("stage3_param_persistence_threshold") == 0:
+            zero["stage3_param_persistence_threshold"] = 10 * hidden_size
+
    # keep for quick debug:
    # from pprint import pprint; pprint(config)

-    # init that takes part of the config via `args`, and the bulk of it via `config_params`
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
+
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
        model_parameters=model_parameters,
@@ -448,14 +519,26 @@ def init_deepspeed(trainer, num_training_steps, resume_from_checkpoint=None):
        lr_scheduler=lr_scheduler,
    )

-    if resume_from_checkpoint is not None:  # and os.path.isdir(resume_from_checkpoint):
-        logger.info(f"Attempting to resume from {resume_from_checkpoint}")
-        # this magically updates self.optimizer and self.lr_scheduler
-        load_path, _ = model.load_checkpoint(
-            resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True
-        )
-        if load_path is None:
-            raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}")
+    if resume_from_checkpoint is not None:
+
+        # it's possible that the user is trying to resume from model_path, which doesn't necessarily
+        # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's
+        # a resume from a checkpoint and not just a local pretrained weight. So we check here if the
+        # path contains what looks like a deepspeed checkpoint
+        import glob
+
+        deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*"))
+
+        if len(deepspeed_checkpoint_dirs) > 0:
+            logger.info(f"Attempting to resume from {resume_from_checkpoint}")
+            # this magically updates self.optimizer and self.lr_scheduler
+            load_path, _ = model.load_checkpoint(
+                resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True
+            )
+            if load_path is None:
+                raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}")
+        else:
+            logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing")

    return model, optimizer, lr_scheduler

@@ -521,9 +604,11 @@ class TensorBoardCallback(TrainerCallback):
                self.tb_writer.add_hparams(args.to_sanitized_dict(), metric_dict={})

    def on_log(self, args, state, control, logs=None, **kwargs):
-        if state.is_world_process_zero:
-            if self.tb_writer is None:
-                self._init_summary_writer(args)
+        if not state.is_world_process_zero:
+            return
+
+        if self.tb_writer is None:
+            self._init_summary_writer(args)

        if self.tb_writer is not None:
            logs = rewrite_logs(logs)
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -387,6 +387,7 @@ class FlaxPreTrainedModel(ABC):
        # get abs dir
        save_directory = os.path.abspath(save_directory)
        # save config as well
+        self.config.architectures = [self.__class__.__name__[4:]]
        self.config.save_pretrained(save_directory)

        # save model
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -380,6 +380,16 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
    missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
    missing_keys += missing_keys_pt

+    # Some models may have keys that are not in the state by design, removing them before needlessly warning
+    # the user.
+    if pt_model._keys_to_ignore_on_load_missing is not None:
+        for pat in pt_model._keys_to_ignore_on_load_missing:
+            missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
+
+    if pt_model._keys_to_ignore_on_load_unexpected is not None:
+        for pat in pt_model._keys_to_ignore_on_load_unexpected:
+            unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+
    if len(unexpected_keys) > 0:
        logger.warning(
            f"Some weights of the TF 2.0 model were not used when "
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -290,7 +290,7 @@ def booleans_processing(config, **kwargs):
            or kwargs["output_hidden_states"] is not None
            or ("use_cache" in kwargs and kwargs["use_cache"] is not None)
        ):
-            tf_logger.warn(
+            tf_logger.warning(
                "The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model."
                "They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`)."
            )
@@ -299,7 +299,9 @@ def booleans_processing(config, **kwargs):
        final_booleans["output_hidden_states"] = config.output_hidden_states

        if kwargs["return_dict"] is not None:
-            tf_logger.warn("The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.")
+            tf_logger.warning(
+                "The parameter `return_dict` cannot be set in graph mode and will always be set to `True`."
+            )
        final_booleans["return_dict"] = True

        if "use_cache" in kwargs:
@@ -398,7 +400,7 @@ def input_processing(func, config, input_ids, **kwargs):
            if isinstance(v, allowed_types) or v is None:
                output[k] = v
            elif k not in parameter_names and "args" not in parameter_names:
-                logger.warn(
+                logger.warning(
                    f"The parameter {k} does not belongs to the parameter list {parameter_names} and will be ignored."
                )
                continue
@@ -1035,6 +1037,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin):
            logger.info(f"Saved model created in {saved_model_dir}")

        # Save configuration file
+        self.config.architectures = [self.__class__.__name__[2:]]
        self.config.save_pretrained(save_directory)

        # If we save using the predefined names, we can load using `from_pretrained`
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -41,6 +41,7 @@ from .file_utils import (
    replace_return_docstrings,
 )
 from .generation_utils import GenerationMixin
+from .integrations import is_deepspeed_zero3_enabled
 from .utils import logging


@@ -660,7 +661,14 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
        if new_num_tokens is None:
            return old_embeddings

-        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=None):
+                old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        else:
+            old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+
        if old_num_tokens == new_num_tokens:
            return old_embeddings

@@ -677,8 +685,17 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
        self._init_weights(new_embeddings)

        # Copy token embeddings from the previous weights
-        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
-        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
+
+        # number of tokens to copy
+        n = min(old_num_tokens, new_num_tokens)
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=0):
+                if torch.distributed.get_rank() == 0:
+                    new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :]
+        else:
+            new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :]

        return new_embeddings

@@ -932,6 +949,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):

            Passing :obj:`use_auth_token=True` is required when you want to use a private model.

+        .. note::
+
+            Activate the special `"offline-mode"
+            <https://huggingface.co/transformers/installation.html#offline-mode>`__ to use this method in a firewalled
+            environment.
+
        Examples::

            >>> from transformers import BertConfig, BertModel
@@ -1056,7 +1079,16 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
        config.name_or_path = pretrained_model_name_or_path

        # Instantiate model.
-        model = cls(config, *model_args, **model_kwargs)
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
+            # this immediately partitions the model to avoid the overhead in time and memory copying it on CPU or each GPU first
+            with deepspeed.zero.Init():
+                model = cls(config, *model_args, **model_kwargs)
+        else:
+            model = cls(config, *model_args, **model_kwargs)

        if state_dict is None and not from_tf:
            try:
@@ -1106,23 +1138,27 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):

            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, "_metadata", None)
-            state_dict = state_dict.copy()
-            if metadata is not None:
-                state_dict._metadata = metadata
+            #state_dict = state_dict.copy()
+            #if metadata is not None:
+            #    state_dict._metadata = metadata

            # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
            # so we need to apply the function recursively.
            def load(module: nn.Module, prefix=""):
                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-                module._load_from_state_dict(
-                    state_dict,
-                    prefix,
-                    local_metadata,
-                    True,
-                    missing_keys,
-                    unexpected_keys,
-                    error_msgs,
-                )
+                args = (state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                if is_deepspeed_zero3_enabled():
+                    import deepspeed
+
+                    # because zero3 puts placeholders in model params, this context
+                    # manager gathers (unpartitions) the params of the current layer, then loads from
+                    # the state dict and then re-partitions them again
+                    with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0):
+                        if torch.distributed.get_rank() == 0:
+                            module._load_from_state_dict(*args)
+                else:
+                    module._load_from_state_dict(*args)
+
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -30,8 +30,10 @@ from . import (
    blenderbot_small,
    camembert,
    convbert,
+    cpm,
    ctrl,
    deberta,
+    deit,
    dialogpt,
    distilbert,
    dpr,
@@ -50,6 +52,7 @@ from . import (
    m2m_100,
    marian,
    mbart,
+    megatron_bert,
    mmbt,
    mobilebert,
    mpnet,
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -267,12 +267,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
        """

        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -184,37 +184,6 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Set to True if the token list is already formatted with special tokens for the model
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -22,7 +22,9 @@ from ...file_utils import _BaseLazyModule, is_flax_available, is_tf_available, i


 _import_structure = {
+    "auto_factory": ["get_values"],
    "configuration_auto": ["ALL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"],
+    "feature_extraction_auto": ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"],
    "tokenization_auto": ["TOKENIZER_MAPPING", "AutoTokenizer"],
 }

@@ -103,7 +105,9 @@ if is_flax_available():


 if TYPE_CHECKING:
+    from .auto_factory import get_values
    from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, MODEL_NAMES_MAPPING, AutoConfig
+    from .feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
    from .tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer

    if is_torch_available():
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -328,6 +328,26 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
 """


+def _get_model_class(config, model_mapping):
+    supported_models = model_mapping[type(config)]
+    if not isinstance(supported_models, (list, tuple)):
+        return supported_models
+
+    name_to_model = {model.__name__: model for model in supported_models}
+    architectures = getattr(config, "architectures", [])
+    for arch in architectures:
+        if arch in name_to_model:
+            return name_to_model[arch]
+        elif f"TF{arch}" in name_to_model:
+            return name_to_model[f"TF{arch}"]
+        elif f"Flax{arch}" in name_to_model:
+            return name_to_model[f"Flax{arch}"]
+
+    # If no architecture is set in the config, or none matches the supported models, the first element of the tuple is
+    # the default.
+    return supported_models[0]
+
+
 class _BaseAutoModelClass:
    # Base class for auto models.
    _model_mapping = None
@@ -341,7 +361,8 @@ class _BaseAutoModelClass:

    def from_config(cls, config, **kwargs):
        if type(config) in cls._model_mapping.keys():
-            return cls._model_mapping[type(config)](config, **kwargs)
+            model_class = _get_model_class(config, cls._model_mapping)
+            return model_class(config, **kwargs)
        raise ValueError(
            f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
@@ -356,9 +377,8 @@ class _BaseAutoModelClass:
            )

        if type(config) in cls._model_mapping.keys():
-            return cls._model_mapping[type(config)].from_pretrained(
-                pretrained_model_name_or_path, *model_args, config=config, **kwargs
-            )
+            model_class = _get_model_class(config, cls._model_mapping)
+            return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs)
        raise ValueError(
            f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
@@ -418,3 +438,14 @@ def auto_class_factory(name, model_mapping, checkpoint_for_example="bert-base-ca
    from_pretrained = replace_list_option_in_docstrings(model_mapping)(from_pretrained)
    new_class.from_pretrained = classmethod(from_pretrained)
    return new_class
+
+
+def get_values(model_mapping):
+    result = []
+    for model in model_mapping.values():
+        if isinstance(model, (list, tuple)):
+            result += list(model)
+        else:
+            result.append(model)
+
+    return result
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -33,6 +33,7 @@ from ..convbert.configuration_convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE
 from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
 from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig
 from ..deberta_v2.configuration_deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
+from ..deit.configuration_deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig
 from ..distilbert.configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig
 from ..dpr.configuration_dpr import DPR_PRETRAINED_CONFIG_ARCHIVE_MAP, DPRConfig
 from ..electra.configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig
@@ -50,6 +51,7 @@ from ..lxmert.configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
 from ..m2m_100.configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
 from ..marian.configuration_marian import MarianConfig
 from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig
+from ..megatron_bert.configuration_megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
 from ..mobilebert.configuration_mobilebert import MobileBertConfig
 from ..mpnet.configuration_mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig
 from ..mt5.configuration_mt5 import MT5Config
@@ -83,8 +85,10 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
    (key, value)
    for pretrained_map in [
        # Add archive maps here
+        DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        VIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -133,6 +137,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
 CONFIG_MAPPING = OrderedDict(
    [
        # Add configs here
+        ("deit", DeiTConfig),
        ("gpt_neo", GPTNeoConfig),
        ("big_bird", BigBirdConfig),
        ("speech_to_text", Speech2TextConfig),
@@ -155,6 +160,7 @@ CONFIG_MAPPING = OrderedDict(
        ("pegasus", PegasusConfig),
        ("marian", MarianConfig),
        ("mbart", MBartConfig),
+        ("megatron_bert", MegatronBertConfig),
        ("mpnet", MPNetConfig),
        ("bart", BartConfig),
        ("blenderbot", BlenderbotConfig),
@@ -189,6 +195,7 @@ CONFIG_MAPPING = OrderedDict(
 MODEL_NAMES_MAPPING = OrderedDict(
    [
        # Add full (and cased) model names here
+        ("deit", "DeiT"),
        ("gpt_neo", "GPT Neo"),
        ("big_bird", "BigBird"),
        ("speech_to_text", "Speech2Text"),
@@ -211,6 +218,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
        ("blenderbot", "Blenderbot"),
        ("marian", "Marian"),
        ("mbart", "mBART"),
+        ("megatron_bert", "MegatronBert"),
        ("bart", "BART"),
        ("reformer", "Reformer"),
        ("longformer", "Longformer"),
@@ -243,29 +251,38 @@ MODEL_NAMES_MAPPING = OrderedDict(
 )


+def _get_class_name(model_class):
+    if isinstance(model_class, (list, tuple)):
+        return " or ".join([f":class:`~transformers.{c.__name__}`" for c in model_class])
+    return f":class:`~transformers.{model_class.__name__}`"
+
+
 def _list_model_options(indent, config_to_class=None, use_model_types=True):
    if config_to_class is None and not use_model_types:
        raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.")
    if use_model_types:
        if config_to_class is None:
-            model_type_to_name = {model_type: config.__name__ for model_type, config in CONFIG_MAPPING.items()}
+            model_type_to_name = {
+                model_type: f":class:`~transformers.{config.__name__}`"
+                for model_type, config in CONFIG_MAPPING.items()
+            }
        else:
            model_type_to_name = {
-                model_type: config_to_class[config].__name__
+                model_type: _get_class_name(config_to_class[config])
                for model_type, config in CONFIG_MAPPING.items()
                if config in config_to_class
            }
        lines = [
-            f"{indent}- **{model_type}** -- :class:`~transformers.{model_type_to_name[model_type]}` ({MODEL_NAMES_MAPPING[model_type]} model)"
+            f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)"
            for model_type in sorted(model_type_to_name.keys())
        ]
    else:
-        config_to_name = {config.__name__: clas.__name__ for config, clas in config_to_class.items()}
+        config_to_name = {config.__name__: _get_class_name(clas) for config, clas in config_to_class.items()}
        config_to_model_name = {
            config.__name__: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING.items()
        }
        lines = [
-            f"{indent}- :class:`~transformers.{config_name}` configuration class: :class:`~transformers.{config_to_name[config_name]}` ({config_to_model_name[config_name]} model)"
+            f"{indent}- :class:`~transformers.{config_name}` configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)"
            for config_name in sorted(config_to_name.keys())
        ]
    return "\n".join(lines)
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -0,0 +1,153 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" AutoFeatureExtractor class. """
+
+from collections import OrderedDict
+
+from ...feature_extraction_utils import FeatureExtractionMixin
+from ...file_utils import is_speech_available, is_vision_available
+from ..wav2vec2.feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
+from .configuration_auto import replace_list_option_in_docstrings
+
+
+if is_speech_available():
+    from ..speech_to_text.feature_extraction_speech_to_text import Speech2TextFeatureExtractor
+else:
+    Speech2TextFeatureExtractor = None
+
+if is_vision_available():
+    from ..deit.feature_extraction_deit import DeiTFeatureExtractor
+    from ..vit.feature_extraction_vit import ViTFeatureExtractor
+else:
+    DeiTFeatureExtractor = None
+    ViTFeatureExtractor = None
+
+
+# Build the list of all feature extractors. Keys are short model identifiers, which `AutoFeatureExtractor` also uses
+# as substrings to look for in the model name or path. A value is `None` when the optional dependency required by that
+# feature extractor (speech or vision) is not available.
+FEATURE_EXTRACTOR_MAPPING = OrderedDict(
+    [
+        ("deit", DeiTFeatureExtractor),
+        ("s2t", Speech2TextFeatureExtractor),
+        ("vit", ViTFeatureExtractor),
+        ("wav2vec2", Wav2Vec2FeatureExtractor),
+    ]
+)
+
+
+def feature_extractor_class_from_name(class_name: str):
+    """
+    Look up a feature extractor class registered in :obj:`FEATURE_EXTRACTOR_MAPPING` by its class name.
+
+    Entries of the mapping set to :obj:`None` are skipped. Returns the first class whose ``__name__`` equals
+    :obj:`class_name`, or :obj:`None` when there is no such class.
+    """
+    candidates = (klass for klass in FEATURE_EXTRACTOR_MAPPING.values() if klass is not None)
+    return next((klass for klass in candidates if klass.__name__ == class_name), None)
+
+
+class AutoFeatureExtractor:
+    r"""
+    This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the
+    library when created with the :meth:`AutoFeatureExtractor.from_pretrained` class method.
+
+    This class cannot be instantiated directly using ``__init__()`` (throws an error).
+    """
+
+    def __init__(self):
+        raise EnvironmentError(
+            "AutoFeatureExtractor is designed to be instantiated "
+            "using the `AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path)` method."
+        )
+
+    @classmethod
+    @replace_list_option_in_docstrings(FEATURE_EXTRACTOR_MAPPING)
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r"""
+        Instantiate one of the feature extractor classes of the library from a pretrained feature extractor
+        configuration.
+
+        The feature extractor class to instantiate is selected based on the :obj:`feature_extractor_type` key of the
+        loaded feature extractor configuration, or when it's missing, by falling back to using pattern matching on
+        :obj:`pretrained_model_name_or_path`:
+
+        List options
+
+        Params:
+            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+                This can be either:
+
+                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
+                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
+                - a path to a `directory` containing a feature extractor file saved using the
+                  :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` method, e.g.,
+                  ``./my_model_directory/``.
+                - a path or url to a saved feature extractor JSON `file`, e.g.,
+                  ``./my_model_directory/feature_extraction_config.json``.
+            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
+                standard cache should not be used.
+            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to force to (re-)download the feature extractor files and override the cached versions
+                if they exist.
+            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to delete incompletely received file. Attempts to resume the download if such a file
+                exists.
+            proxies (:obj:`Dict[str, str]`, `optional`):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            use_auth_token (:obj:`str` or `bool`, `optional`):
+                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
+                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
+            revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                identifier allowed by git.
+            return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
+                then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+                dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
+                part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
+            kwargs (:obj:`Dict[str, Any]`, `optional`):
+                The values in kwargs of any keys which are feature extractor attributes will be used to override the
+                loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
+                controlled by the ``return_unused_kwargs`` keyword parameter.
+
+        Raises:
+            :obj:`ValueError`: If the :obj:`feature_extractor_type` of the configuration does not resolve to an
+            available feature extractor class, or if no class can be inferred from
+            :obj:`pretrained_model_name_or_path`.
+
+            :obj:`ImportError`: If the name matches a feature extractor whose optional dependencies are not installed.
+
+        .. note::
+
+            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+
+        Examples::
+
+            >>> from transformers import AutoFeatureExtractor
+
+            >>> # Download feature extractor configuration from huggingface.co and cache.
+            >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')
+
+            >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using `save_pretrained('./test/saved_model/')`)
+            >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/')
+
+        """
+        kwargs["_from_auto"] = True
+        config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
+
+        if "feature_extractor_type" in config_dict:
+            feature_extractor_type = config_dict["feature_extractor_type"]
+            feature_extractor_class = feature_extractor_class_from_name(feature_extractor_type)
+            if feature_extractor_class is None:
+                # The lookup skips `None` entries of the mapping, so `None` here means either an unknown class name
+                # or a class whose optional dependency is missing. Fail with a clear message instead of an
+                # AttributeError on `None.from_dict`.
+                raise ValueError(
+                    f"Unrecognized feature extractor type {feature_extractor_type} in "
+                    f"{pretrained_model_name_or_path}. Either it is not one of the feature extractors of the library "
+                    "or the optional dependencies it requires are not installed."
+                )
+            return feature_extractor_class.from_dict(config_dict, **kwargs)
+
+        # Fallback: use pattern matching on the string.
+        for pattern, feature_extractor_class in FEATURE_EXTRACTOR_MAPPING.items():
+            if pattern in str(pretrained_model_name_or_path):
+                if feature_extractor_class is None:
+                    # The mapping stores `None` for feature extractors whose optional dependency is unavailable.
+                    raise ImportError(
+                        f"The feature extractor matching the pattern '{pattern}' in {pretrained_model_name_or_path} "
+                        "is not available because the optional dependencies it requires are not installed."
+                    )
+                return feature_extractor_class.from_dict(config_dict, **kwargs)
+
+        raise ValueError(
+            f"Unrecognized model in {pretrained_model_name_or_path}. Should have a `feature_extractor_type` key in "
+            "its feature_extraction_config.json, or contain one of the following strings "
+            f"in its name: {', '.join(FEATURE_EXTRACTOR_MAPPING.keys())}"
+        )
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -19,6 +19,8 @@ import warnings
 from collections import OrderedDict

 from ...utils import logging
+
+# Add modeling imports here
 from ..albert.modeling_albert import (
    AlbertForMaskedLM,
    AlbertForMultipleChoice,
@@ -95,6 +97,7 @@ from ..deberta_v2.modeling_deberta_v2 import (
    DebertaV2ForTokenClassification,
    DebertaV2Model,
 )
+from ..deit.modeling_deit import DeiTForImageClassification, DeiTForImageClassificationWithTeacher, DeiTModel
 from ..distilbert.modeling_distilbert import (
    DistilBertForMaskedLM,
    DistilBertForMultipleChoice,
@@ -124,6 +127,7 @@ from ..flaubert.modeling_flaubert import (
 )
 from ..fsmt.modeling_fsmt import FSMTForConditionalGeneration, FSMTModel
 from ..funnel.modeling_funnel import (
+    FunnelBaseModel,
    FunnelForMaskedLM,
    FunnelForMultipleChoice,
    FunnelForPreTraining,
@@ -133,8 +137,6 @@ from ..funnel.modeling_funnel import (
    FunnelModel,
 )
 from ..gpt2.modeling_gpt2 import GPT2ForSequenceClassification, GPT2LMHeadModel, GPT2Model
-
-# Add modeling imports here
 from ..gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM, GPTNeoModel
 from ..ibert.modeling_ibert import (
    IBertForMaskedLM,
@@ -174,6 +176,17 @@ from ..mbart.modeling_mbart import (
    MBartForSequenceClassification,
    MBartModel,
 )
+from ..megatron_bert.modeling_megatron_bert import (
+    MegatronBertForCausalLM,
+    MegatronBertForMaskedLM,
+    MegatronBertForMultipleChoice,
+    MegatronBertForNextSentencePrediction,
+    MegatronBertForPreTraining,
+    MegatronBertForQuestionAnswering,
+    MegatronBertForSequenceClassification,
+    MegatronBertForTokenClassification,
+    MegatronBertModel,
+)
 from ..mobilebert.modeling_mobilebert import (
    MobileBertForMaskedLM,
    MobileBertForMultipleChoice,
@@ -281,6 +294,7 @@ from .configuration_auto import (
    CTRLConfig,
    DebertaConfig,
    DebertaV2Config,
+    DeiTConfig,
    DistilBertConfig,
    DPRConfig,
    ElectraConfig,
@@ -298,6 +312,7 @@ from .configuration_auto import (
    M2M100Config,
    MarianConfig,
    MBartConfig,
+    MegatronBertConfig,
    MobileBertConfig,
    MPNetConfig,
    MT5Config,
@@ -327,6 +342,7 @@ logger = logging.get_logger(__name__)
 MODEL_MAPPING = OrderedDict(
    [
        # Base model mapping
+        (DeiTConfig, DeiTModel),
        (GPTNeoConfig, GPTNeoModel),
        (BigBirdConfig, BigBirdModel),
        (Speech2TextConfig, Speech2TextModel),
@@ -355,6 +371,7 @@ MODEL_MAPPING = OrderedDict(
        (BertConfig, BertModel),
        (OpenAIGPTConfig, OpenAIGPTModel),
        (GPT2Config, GPT2Model),
+        (MegatronBertConfig, MegatronBertModel),
        (MobileBertConfig, MobileBertModel),
        (TransfoXLConfig, TransfoXLModel),
        (XLNetConfig, XLNetModel),
@@ -364,7 +381,7 @@ MODEL_MAPPING = OrderedDict(
        (CTRLConfig, CTRLModel),
        (ElectraConfig, ElectraModel),
        (ReformerConfig, ReformerModel),
-        (FunnelConfig, FunnelModel),
+        (FunnelConfig, (FunnelModel, FunnelBaseModel)),
        (LxmertConfig, LxmertModel),
        (BertGenerationConfig, BertGenerationEncoder),
        (DebertaConfig, DebertaModel),
@@ -398,6 +415,7 @@ MODEL_FOR_PRETRAINING_MAPPING = OrderedDict(
        (BigBirdConfig, BigBirdForPreTraining),
        (OpenAIGPTConfig, OpenAIGPTLMHeadModel),
        (GPT2Config, GPT2LMHeadModel),
+        (MegatronBertConfig, MegatronBertForPreTraining),
        (MobileBertConfig, MobileBertForPreTraining),
        (TransfoXLConfig, TransfoXLLMHeadModel),
        (XLNetConfig, XLNetLMHeadModel),
@@ -441,6 +459,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
        (BertConfig, BertForMaskedLM),
        (OpenAIGPTConfig, OpenAIGPTLMHeadModel),
        (GPT2Config, GPT2LMHeadModel),
+        (MegatronBertConfig, MegatronBertForMaskedLM),
        (MobileBertConfig, MobileBertForMaskedLM),
        (TransfoXLConfig, TransfoXLLMHeadModel),
        (XLNetConfig, XLNetLMHeadModel),
@@ -456,6 +475,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
        (DebertaConfig, DebertaForMaskedLM),
        (DebertaV2Config, DebertaV2ForMaskedLM),
        (IBertConfig, IBertForMaskedLM),
+        (MegatronBertConfig, MegatronBertForCausalLM),
    ]
 )

@@ -487,6 +507,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict(
        (MarianConfig, MarianForCausalLM),
        (BlenderbotConfig, BlenderbotForCausalLM),
        (BlenderbotSmallConfig, BlenderbotSmallForCausalLM),
+        (MegatronBertConfig, MegatronBertForCausalLM),
    ]
 )

@@ -494,6 +515,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = OrderedDict(
    [
        # Model for Image Classification mapping
        (ViTConfig, ViTForImageClassification),
+        (DeiTConfig, (DeiTForImageClassification, DeiTForImageClassificationWithTeacher)),
    ]
 )

@@ -514,6 +536,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
        (RobertaConfig, RobertaForMaskedLM),
        (SqueezeBertConfig, SqueezeBertForMaskedLM),
        (BertConfig, BertForMaskedLM),
+        (MegatronBertConfig, MegatronBertForMaskedLM),
        (MobileBertConfig, MobileBertForMaskedLM),
        (FlaubertConfig, FlaubertWithLMHeadModel),
        (XLMConfig, XLMWithLMHeadModel),
@@ -566,6 +589,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
        (LayoutLMConfig, LayoutLMForSequenceClassification),
        (BertConfig, BertForSequenceClassification),
        (XLNetConfig, XLNetForSequenceClassification),
+        (MegatronBertConfig, MegatronBertForSequenceClassification),
        (MobileBertConfig, MobileBertForSequenceClassification),
        (FlaubertConfig, FlaubertForSequenceClassification),
        (XLMConfig, XLMForSequenceClassification),
@@ -602,6 +626,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
        (BertConfig, BertForQuestionAnswering),
        (XLNetConfig, XLNetForQuestionAnsweringSimple),
        (FlaubertConfig, FlaubertForQuestionAnsweringSimple),
+        (MegatronBertConfig, MegatronBertForQuestionAnswering),
        (MobileBertConfig, MobileBertForQuestionAnswering),
        (XLMConfig, XLMForQuestionAnsweringSimple),
        (ElectraConfig, ElectraForQuestionAnswering),
@@ -637,6 +662,7 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict(
        (RobertaConfig, RobertaForTokenClassification),
        (SqueezeBertConfig, SqueezeBertForTokenClassification),
        (BertConfig, BertForTokenClassification),
+        (MegatronBertConfig, MegatronBertForTokenClassification),
        (MobileBertConfig, MobileBertForTokenClassification),
        (XLNetConfig, XLNetForTokenClassification),
        (AlbertConfig, AlbertForTokenClassification),
@@ -663,6 +689,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
        (SqueezeBertConfig, SqueezeBertForMultipleChoice),
        (BertConfig, BertForMultipleChoice),
        (DistilBertConfig, DistilBertForMultipleChoice),
+        (MegatronBertConfig, MegatronBertForMultipleChoice),
        (MobileBertConfig, MobileBertForMultipleChoice),
        (XLNetConfig, XLNetForMultipleChoice),
        (AlbertConfig, AlbertForMultipleChoice),
@@ -677,6 +704,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict(
 MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = OrderedDict(
    [
        (BertConfig, BertForNextSentencePrediction),
+        (MegatronBertConfig, MegatronBertForNextSentencePrediction),
        (MobileBertConfig, MobileBertForNextSentencePrediction),
    ]
 )
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -91,6 +91,7 @@ from ..flaubert.modeling_tf_flaubert import (
    TFFlaubertWithLMHeadModel,
 )
 from ..funnel.modeling_tf_funnel import (
+    TFFunnelBaseModel,
    TFFunnelForMaskedLM,
    TFFunnelForMultipleChoice,
    TFFunnelForPreTraining,
@@ -242,7 +243,7 @@ TF_MODEL_MAPPING = OrderedDict(
        (XLMConfig, TFXLMModel),
        (CTRLConfig, TFCTRLModel),
        (ElectraConfig, TFElectraModel),
-        (FunnelConfig, TFFunnelModel),
+        (FunnelConfig, (TFFunnelModel, TFFunnelBaseModel)),
        (DPRConfig, TFDPRQuestionEncoder),
        (MPNetConfig, TFMPNetModel),
        (BartConfig, TFBartModel),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -115,6 +115,7 @@ if is_sentencepiece_available():
    from ..bert_generation.tokenization_bert_generation import BertGenerationTokenizer
    from ..big_bird.tokenization_big_bird import BigBirdTokenizer
    from ..camembert.tokenization_camembert import CamembertTokenizer
+    from ..cpm.tokenization_cpm import CpmTokenizer
    from ..deberta_v2.tokenization_deberta_v2 import DebertaV2Tokenizer
    from ..m2m_100 import M2M100Tokenizer
    from ..marian.tokenization_marian import MarianTokenizer
@@ -134,6 +135,7 @@ else:
    BertGenerationTokenizer = None
    BigBirdTokenizer = None
    CamembertTokenizer = None
+    CpmTokenizer = None
    DebertaV2Tokenizer = None
    MarianTokenizer = None
    MBartTokenizer = None
@@ -273,6 +275,7 @@ TOKENIZER_MAPPING = OrderedDict(
 NO_CONFIG_TOKENIZER = [
    BertJapaneseTokenizer,
    BertweetTokenizer,
+    CpmTokenizer,
    HerbertTokenizer,
    HerbertTokenizerFast,
    PhobertTokenizer,
@@ -409,7 +412,7 @@ class AutoTokenizer:
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
-                logger.warn(
+                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -23,7 +23,7 @@ import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, MSELoss

 from ...activations import ACT2FN
 from ...file_utils import (
@@ -1011,7 +1011,7 @@ class BartDecoder(BartPretrainedModel):
            if getattr(self.config, "gradient_checkpointing", False) and self.training:

                if use_cache:
-                    logger.warn(
+                    logger.warning(
                        "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                        "`use_cache=False`..."
                    )
@@ -1437,8 +1437,13 @@ class BartForSequenceClassification(BartPretrainedModel):

        loss = None
        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+            if self.config.num_labels == 1:
+                # regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -180,12 +180,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
--- a/src/transformers/models/barthez/tokenization_barthez_fast.py
+++ b/src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -164,36 +164,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def get_special_tokens_mask(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
-    ) -> List[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
-
-        Args:
-            token_ids_0 (:obj:`List[int]`):
-                List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formated with special tokens for the model."
-                )
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
--- a/Show More
+++ b/Show More