Release: v4.6.0

[Lazy init] Force fall back to slow init for composite models (#11705 )
* fix encoder-decoder & RAG * finalize * Update src/transformers/models/encoder_decoder/modeling_encoder_decoder.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * Update src/transformers/models/rag/modeling_rag.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> Co-authored-by: Patrick von Platen <patrick@huggingface.co> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-05-12 17:03:03 +02:00 · 2021-05-12 10:52:54 -04:00 · 2021-05-12 09:48:52 -04:00 · 2021-05-12 09:11:10 -04:00 · 2021-05-12 13:52:52 +01:00 · 2021-05-12 15:28:30 +05:30
584 changed files with 42474 additions and 7964 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -145,7 +145,7 @@ jobs:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
+            - run: python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -277,7 +277,7 @@ jobs:
                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[ja,testing,sentencepiece]
+            - run: pip install .[ja,testing,sentencepiece,jieba]
            - run: python -m unidic download
            - save_cache:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -306,35 +306,44 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
            - run: pip install .[sklearn,torch,sentencepiece,testing]
-            - run: pip install -r examples/_tests_requirements.txt
+            - run: pip install -r examples/pytorch/_tests_requirements.txt
            - save_cache:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
+            - run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
                  path: ~/transformers/reports

-    run_tests_git_lfs:
+    run_tests_hub:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.7
        environment:
+            HUGGINGFACE_CO_STAGING: yes
            RUN_GIT_LFS_TESTS: yes
            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-hub-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get install git-lfs
            - run: |
                git config --global user.email "ci@dummy.com"
                git config --global user.name "ci"
            - run: pip install --upgrade pip
-            - run: pip install .[testing]
-            - run: python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
+            - run: pip install .[torch,sentencepiece,testing]
+            - save_cache:
+                  key: v0.4-hub-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python -m pytest -sv ./tests/ -m is_staging_test

    build_doc:
        working_directory: ~/transformers
@@ -348,7 +357,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install ."[all, docs]"
+            - run: pip install ."[docs]"
            - save_cache:
                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
@@ -370,7 +379,7 @@ jobs:
                  keys:
                      - v0.4-deploy_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: pip install ."[all,docs]"
+            - run: pip install ."[docs]"
            - save_cache:
                  key: v0.4-deploy_doc-{{ checksum "setup.py" }}
                  paths:
@@ -382,6 +391,8 @@ jobs:
        docker:
            - image: circleci/python:3.6
        resource_class: medium
+        environment:
+            TRANSFORMERS_IS_CI: yes
        parallelism: 1
        steps:
            - checkout
@@ -469,7 +480,7 @@ workflows:
            - run_tests_flax
            - run_tests_pipelines_torch
            - run_tests_pipelines_tf
-            - run_tests_git_lfs
+            - run_tests_hub
            - build_doc
            - deploy_doc: *workflow_filters
 #    tpu_testing_jobs:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -60,4 +60,6 @@ deploy_doc "7d9a9d0" v4.2.2
 deploy_doc "bae0c79" v4.3.3
 deploy_doc "c988db5" v4.4.0
 deploy_doc "c5d6a28" v4.4.1
-deploy_doc "6bc89ed"  # v4.4.2 Latest stable release
+deploy_doc "6bc89ed" v4.4.2
+deploy_doc "4906a29" v4.5.0
+deploy_doc "4bae96e"  # v4.5.1 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -34,7 +34,7 @@ Models:
 - funnel: @sgugger
 - gpt2: @patrickvonplaten, @LysandreJik
 - rag: @patrickvonplaten, @lhoestq
- tensorflow: @LysandreJik
+- tensorflow: @Rocketknight1

 Library:

--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -30,7 +30,7 @@ Fixes # (issue)
 ## Who can review?

 Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
-members/contributors which may be interested in your PR.
+members/contributors who may be interested in your PR.

 <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @

--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -16,6 +16,8 @@ requirements:
    - pip
    - numpy >=1.17
    - dataclasses
+    - importlib_metadata
+    - huggingface_hub
    - packaging
    - filelock
    - requests
@@ -28,6 +30,8 @@ requirements:
    - python
    - numpy >=1.17
    - dataclasses
+    - importlib_metadata
+    - huggingface_hub
    - packaging
    - filelock
    - requests
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -1,6 +1,9 @@
 name: Model templates runner

 on:
+  push:
+    branches:
+      - master
  pull_request:
    paths:
      - "src/**"
@@ -34,6 +37,7 @@ jobs:
      - name: Install dependencies
        run: |
          pip install --upgrade pip
+          sudo apt -y update && sudo apt install -y libsndfile1-dev
          pip install .[dev]
      - name: Create model files
        run: |
@@ -46,6 +50,7 @@ jobs:
          make style
          python utils/check_table.py --fix_and_overwrite
          python utils/check_dummies.py --fix_and_overwrite
+          python utils/check_copies.py --fix_and_overwrite

      - name: Run all non-slow tests
        run: |
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@@ -24,6 +24,7 @@ jobs:
        with:
          auto-update-conda: true
          auto-activate-base: false
+          python-version: 3.8
          activate-environment: "build-transformers"
          channels: huggingface

--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -5,6 +5,7 @@ on:
    branches:
      - master
      - ci_*
+      - ci-*
    paths:
      - "src/**"
      - "tests/**"
@@ -62,6 +63,7 @@ jobs:

  run_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
+    timeout-minutes: 120
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -88,7 +90,7 @@ jobs:
          TF_NUM_INTRAOP_THREADS: 8
          TF_NUM_INTEROP_THREADS: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -147,6 +149,7 @@ jobs:

  run_tests_tf_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    timeout-minutes: 120
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -173,7 +176,7 @@ jobs:
          TF_NUM_INTRAOP_THREADS: 8
          TF_NUM_INTEROP_THREADS: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -186,11 +189,101 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+  run_tests_torch_cuda_extensions_gpu:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libaio-dev
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_gpu_test_reports
+          path: reports
+
+  run_tests_torch_cuda_extensions_multi_gpu:
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libaio-dev
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+          path: reports
+
+
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
+    needs: [
+        run_tests_torch_gpu,
+        run_tests_tf_gpu,
+        run_tests_torch_multi_gpu,
+        run_tests_tf_multi_gpu,
+        run_tests_torch_cuda_extensions_gpu,
+        run_tests_torch_cuda_extensions_multi_gpu
+    ]
    steps:
      - uses: actions/checkout@v2

@@ -203,4 +296,4 @@ jobs:

        run: |
          pip install slack_sdk
-          python utils/notification_service.py push
+          python utils/notification_service.py push
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -59,7 +59,7 @@ jobs:
          HF_HOME: /mnt/cache
          TRANSFORMERS_IS_CI: yes
        run: |
-          pip install -r examples/_tests_requirements.txt
+          pip install -r examples/pytorch/_tests_requirements.txt
          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
@@ -246,11 +246,100 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+  run_all_tests_torch_cuda_extensions_gpu:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libaio-dev
+          pip install --upgrade pip
+          pip install .[testing,deepspeed]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_gpu_test_reports
+          path: reports
+
+  run_all_tests_torch_cuda_extensions_multi_gpu:
+    runs-on: [self-hosted, docker-gpu, multi-gpu]
+    container:
+      image: nvcr.io/nvidia/pytorch:21.03-py3
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libaio-dev
+          pip install --upgrade pip
+          pip install .[testing,deepspeed,fairscale]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Cuda version:', torch.version.cuda)"
+          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+          path: reports
+
  send_results:
    name: Send results to webhook
    runs-on: ubuntu-latest
    if: always()
-    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
+    needs: [
+        run_all_tests_torch_gpu,
+        run_all_tests_tf_gpu,
+        run_all_tests_torch_multi_gpu,
+        run_all_tests_tf_multi_gpu,
+        run_all_tests_torch_cuda_extensions_gpu,
+        run_all_tests_torch_cuda_extensions_multi_gpu
+    ]
    steps:
      - uses: actions/checkout@v2

--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -2,7 +2,7 @@ name: Stale Bot

 on:
  schedule:
-    - cron: "0 0 * * *"
+    - cron: "0 15 * * *"

 jobs:
  close_stale_issues:
--- a/.gitignore
+++ b/.gitignore
@@ -9,8 +9,7 @@ __pycache__/
 *.so

 # tests and logs
-tests/fixtures/*
-!tests/fixtures/sample_text_no_unicode.txt
+tests/fixtures/cached_*_text.txt
 logs/
 lightning_logs/
 lang_code_data/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -36,6 +36,13 @@ There are 4 ways you can contribute to transformers:
 * Contributing to the examples or to the documentation;
 * Submitting issues related to bugs or desired new features.

+In particular there is a special [Good First
+Issue](https://github.com/huggingface/transformers/contribute) listing. Tt will give you a list of
+open Issues that are open to anybody to work on. Just comment in the issue that you'd like to work
+on it. In that same listing you will also find some Issues with `Good Second Issue` label. These are
+typically slightly more complicated than the Issues with just `Good First Issue` label. But if you
+feel you know what you're doing, go for it.
+
 *All are equally valuable to the community.*

 ## Submitting a new issue or feature request
@@ -46,7 +53,7 @@ feedback.

 ### Did you find a bug?

-The transformers are robust and reliable thanks to the users who notify us of
+The 🤗 Transformers library is robust and reliable thanks to the users who notify us of
 the problems they encounter. So thank you for reporting an issue.

 First, we would really appreciate it if you could **make sure the bug was not
@@ -285,7 +292,7 @@ $ python -m pytest -n auto --dist=loadfile -s -v ./tests/
 and for the examples:

 ```bash
-$ pip install -r examples/requirements.txt  # only needed the first time
+$ pip install -r examples/xxx/requirements.txt  # only needed the first time
 $ python -m pytest -n auto --dist=loadfile -s -v ./examples/
 ```
 In fact, that's how `make test` and `make test-examples` are implemented (sans the `pip install` line)!
@@ -343,7 +350,7 @@ You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉

 ### Syncing forked master with upstream (HuggingFace) master

-To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnessary notifications to the developers involved in these PRs, 
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnessary notifications to the developers involved in these PRs,
 when syncing the master branch of a forked repository, please, follow these steps:
 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
 2. If a PR is absolutely necessary, use the following steps after checking out your branch:
--- a/4
+++ b/4
@@ -1,5 +1,7 @@
 .PHONY: deps_table_update modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs

+# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
+export PYTHONPATH = src

 check_dirs := examples tests src utils

@@ -73,7 +75,7 @@ test:
 # Run tests for examples

 test-examples:
-	python -m pytest -n auto --dist=loadfile -s -v ./examples/
+	python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/

 # Run tests for SageMaker DLC release

--- a/README.md
+++ b/README.md
@@ -38,18 +38,18 @@ limitations under the License.
 </p>

 <h3 align="center">
-<p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
+<p>State-of-the-art Natural Language Processing for Jax, PyTorch and TensorFlow
 </h3>

-🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.
+🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation and more in over 100 languages. Its aim is to make cutting-edge NLP easier to use for everyone.

-🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture can be used as a standalone and modified to enable quick research experiments.
+🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments.

-🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one then load it for inference with the other.
+🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.

 ## Online demos

-You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) to use those models.
+You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) for public and private models.

 Here are a few examples:
 - [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
@@ -64,20 +64,20 @@ Here are a few examples:

 ## Quick tour

-To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model training. Here is how to quickly use a pipeline to classify positive versus negative texts
+To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:

 ```python
 >>> from transformers import pipeline

 # Allocate a pipeline for sentiment-analysis
 >>> classifier = pipeline('sentiment-analysis')
->>> classifier('We are very happy to include pipeline into the transformers repository.')
-[{'label': 'POSITIVE', 'score': 0.9978193640708923}]
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
 ```

-The second line of code downloads and caches the pretrained model used by the pipeline, the third line evaluates it on the given text. Here the answer is "positive" with a confidence of 99.8%.
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here the answer is "positive" with a confidence of 99.97%.

-This is another example of pipeline used for that can extract question answers from some context:
+Many NLP tasks have a pre-trained `pipeline` ready to go. For example, we can easily extract question answers given context:

 ``` python
 >>> from transformers import pipeline
@@ -86,15 +86,15 @@ This is another example of pipeline used for that can extract question answers f
 >>> question_answerer = pipeline('question-answering')
 >>> question_answerer({
 ...     'question': 'What is the name of the repository ?',
-...     'context': 'Pipeline have been included in the huggingface/transformers repository'
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
 ... })
-{'score': 0.5135612454720828, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}

 ```

-On top of the answer, the pretrained model used here returned its confidence score, along with the start position and its end position in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/transformers/task_summary.html).
+In addition to the answer, the pretrained model used here returned its confidence score, along with the start position and end position of the answer in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/transformers/task_summary.html).

-To download and use any of the pretrained models on your given task, you just need to use those three lines of codes (PyTorch version):
+To download and use any of the pretrained models on your given task, all it takes is three lines of code. Here is the PyTorch version:
 ```python
 >>> from transformers import AutoTokenizer, AutoModel

@@ -104,7 +104,7 @@ To download and use any of the pretrained models on your given task, you just ne
 >>> inputs = tokenizer("Hello world!", return_tensors="pt")
 >>> outputs = model(**inputs)
 ```
-or for TensorFlow:
+And here is the equivalent code for TensorFlow:
 ```python
 >>> from transformers import AutoTokenizer, TFAutoModel

@@ -115,9 +115,9 @@ or for TensorFlow:
 >>> outputs = model(**inputs)
 ```

-The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on one (or list) of texts (as we can see on the fourth line of both code examples). It will output a dictionary you can directly pass to your model (which is done on the fifth line).
+The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.

-The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use normally. For instance, [this tutorial](https://huggingface.co/transformers/training.html) explains how to integrate such a model in classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune the on a new dataset.
+The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use normally. [This tutorial](https://huggingface.co/transformers/training.html) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.

 ## Why should I use transformers?

@@ -135,16 +135,16 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
 1. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
    - Move a single model between TF2.0/PyTorch frameworks at will.
-    - Seamlessly pick the right framework for training, evaluation, production.
+    - Seamlessly pick the right framework for training, evaluation and production.

 1. Easily customize a model or an example to your needs:
-    - Examples for each architecture to reproduce the results by the official authors of said architecture.
-    - Expose the models internal as consistently as possible.
+    - We provide examples for each architecture to reproduce the results published by its original authors.
+    - Model internals are exposed as consistently as possible.
    - Model files can be used independently of the library for quick experiments.

 ## Why shouldn't I use transformers?

- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving in additional abstractions/files.
+- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
 - The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library.
 - While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/master/examples) are just that: examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.

@@ -152,16 +152,16 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

 ### With pip

-This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.
+This repository is tested on Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+ and TensorFlow 2.3+.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

 First, create a virtual environment with the version of Python you're going to use and activate it.

-Then, you will need to install at least one of TensorFlow 2.0, PyTorch or Flax.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform and/or [Flax installation page](https://github.com/google/flax#quick-install).
+Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax installation page](https://github.com/google/flax#quick-install) regarding the specific install command for your platform.

-When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
+When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:

 ```bash
 pip install transformers
@@ -179,9 +179,9 @@ Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
 conda install -c huggingface transformers
 ```

-Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
+Follow the installation pages of Flax, PyTorch or TensorFlow to see how to install them with conda.

-## Models architectures
+## Model architectures

 **[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

@@ -195,14 +195,18 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** from (OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
@@ -218,11 +222,14 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
@@ -243,9 +250,9 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

-To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable)
+To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable).

-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
+These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).


 ## Learn more
--- a/docker/transformers-pytorch-tpu/Dockerfile
+++ b/docker/transformers-pytorch-tpu/Dockerfile
@@ -53,7 +53,7 @@ RUN git clone https://github.com/huggingface/transformers.git && \
    git checkout CI && \
    cd .. && \
    pip install ./transformers && \
-    pip install -r ./transformers/examples/requirements.txt && \
+    pip install -r ./transformers/examples/pytorch/_test_requirements.txt && \
    pip install pytest

 RUN python -c "import torch_xla; print(torch_xla.__version__)"
--- a/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
+++ b/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
@@ -27,7 +27,7 @@ local bertBaseCased = base.BaseTest {
  },
  command: utils.scriptCommand(
    |||
-      python -m pytest -s transformers/examples/test_xla_examples.py -v
+      python -m pytest -s transformers/examples/pytorch/test_xla_examples.py -v
      test_exit_code=$?
      echo "\nFinished running commands.\n"
      test $test_exit_code -eq 0
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,11 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.4.2"
+const stableVersion = "v4.5.1"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.4.0/v4.4.1/v4.4.2 (stable)",
+    "": "v4.5.0/v4.5.1 (stable)",
+    "v4.4.2": "v4.4.0/v4.4.1/v4.4.2",
    "v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3",
    "v4.2.2": "v4.2.0/v4.2.1/v4.2.2",
    "v4.1.1": "v4.1.0/v4.1.1",
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.rst
@@ -388,7 +388,7 @@ Next, you can finally start adding new code to 🤗 Transformers. Go into the cl

 ::

-   cd transformers
+    cd transformers

 In the special case that you are adding a model whose architecture exactly matches the model architecture of an
 existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__.
@@ -417,27 +417,27 @@ You should do the following:

 ::

-   git checkout -b add_brand_new_bert
+    git checkout -b add_brand_new_bert

 2. Commit the automatically generated code:

 ::

-   git add .
-   git commit
+    git add .
+    git commit

 3. Fetch and rebase to current master

 ::

-   git fetch upstream
-   git rebase upstream/master
+    git fetch upstream
+    git rebase upstream/master

 4. Push the changes to your account using:

 ::

-   git push -u origin a-descriptive-name-for-my-changes
+    git push -u origin a-descriptive-name-for-my-changes

 5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
@@ -451,8 +451,8 @@ time to time by doing:

 ::

-   git fetch upstream
-   git merge upstream/master
+    git fetch upstream
+    git merge upstream/master

 In general, all questions you might have regarding the model or your implementation should be asked in your PR and
 discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@@ -65,10 +65,10 @@ respectively.
 .. code-block:: bash

    ## PYTORCH CODE
-    python examples/benchmarking/run_benchmark.py --help
+    python examples/pytorch/benchmarking/run_benchmark.py --help

    ## TENSORFLOW CODE
-    python examples/benchmarking/run_benchmark_tf.py --help
+    python examples/tensorflow/benchmarking/run_benchmark_tf.py --help


 An instantiated benchmark object can then simply be run by calling ``benchmark.run()``.
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -51,3 +51,8 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
 |[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
 |[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|
+| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) |
+| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | How to evaluate *LukeForEntityClassification* on the Open Entity dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) |
+| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) |
+| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) |
+| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) |
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -33,8 +33,8 @@ You can convert any TensorFlow checkpoint for BERT (in particular `the pre-train
 This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated
 configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights
 from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that
-can be imported using ``from_pretrained()`` (see example in :doc:`quicktour` , `run_glue.py
-<https://github.com/huggingface/transformers/blob/master/examples/text-classification/run_glue.py>`_\ ).
+can be imported using ``from_pretrained()`` (see example in :doc:`quicktour` , :prefix_link:`run_glue.py
+<examples/pytorch/text-classification/run_glue.py>` \ ).

 You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
 checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\
@@ -47,12 +47,12 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

 .. code-block:: shell

-   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-   transformers-cli convert --model_type bert \
-     --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-     --config $BERT_BASE_DIR/bert_config.json \
-     --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
+    transformers-cli convert --model_type bert \
+      --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
+      --config $BERT_BASE_DIR/bert_config.json \
+      --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here
 <https://github.com/google-research/bert#pre-trained-models>`__.
@@ -72,12 +72,12 @@ Here is an example of the conversion process for the pre-trained ``ALBERT Base``

 .. code-block:: shell

-   export ALBERT_BASE_DIR=/path/to/albert/albert_base
+    export ALBERT_BASE_DIR=/path/to/albert/albert_base

-   transformers-cli convert --model_type albert \
-     --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-     --config $ALBERT_BASE_DIR/albert_config.json \
-     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+    transformers-cli convert --model_type albert \
+      --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+      --config $ALBERT_BASE_DIR/albert_config.json \
+      --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here
 <https://github.com/google-research/albert#pre-trained-models>`__.
@@ -91,13 +91,13 @@ save as the same format than OpenAI pretrained model (see `here <https://github.

 .. code-block:: shell

-   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-   transformers-cli convert --model_type gpt \
-     --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config OPENAI_GPT_CONFIG] \
-     [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
+    transformers-cli convert --model_type gpt \
+      --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config OPENAI_GPT_CONFIG] \
+      [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \


 OpenAI GPT-2
@@ -108,13 +108,13 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode

 .. code-block:: shell

-   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

-   transformers-cli convert --model_type gpt2 \
-     --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config OPENAI_GPT2_CONFIG] \
-     [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
+    transformers-cli convert --model_type gpt2 \
+      --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config OPENAI_GPT2_CONFIG] \
+      [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]

 Transformer-XL
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -124,13 +124,13 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo

 .. code-block:: shell

-   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

-   transformers-cli convert --model_type transfo_xl \
-     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--config TRANSFO_XL_CONFIG] \
-     [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
+    transformers-cli convert --model_type transfo_xl \
+      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--config TRANSFO_XL_CONFIG] \
+      [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]


 XLNet
@@ -140,14 +140,14 @@ Here is an example of the conversion process for a pre-trained XLNet model:

 .. code-block:: shell

-   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
+    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-   transformers-cli convert --model_type xlnet \
-     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-     --config $TRANSFO_XL_CONFIG_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-     [--finetuning_task_name XLNET_FINETUNED_TASK] \
+    transformers-cli convert --model_type xlnet \
+      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
+      --config $TRANSFO_XL_CONFIG_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+      [--finetuning_task_name XLNET_FINETUNED_TASK] \


 XLM
@@ -157,13 +157,13 @@ Here is an example of the conversion process for a pre-trained XLM model:

 .. code-block:: shell

-   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-   transformers-cli convert --model_type xlm \
-     --tf_checkpoint $XLM_CHECKPOINT_PATH \
-     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
-    [--config XML_CONFIG] \
-    [--finetuning_task_name XML_FINETUNED_TASK]
+    transformers-cli convert --model_type xlm \
+      --tf_checkpoint $XLM_CHECKPOINT_PATH \
+      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
+     [--config XML_CONFIG] \
+     [--finetuning_task_name XML_FINETUNED_TASK]


 T5
@@ -173,9 +173,9 @@ Here is an example of the conversion process for a pre-trained T5 model:

 .. code-block:: shell

-   export T5=/path/to/t5/uncased_L-12_H-768_A-12
+    export T5=/path/to/t5/uncased_L-12_H-768_A-12

-   transformers-cli convert --model_type t5 \
-     --tf_checkpoint $T5/t5_model.ckpt \
-     --config $T5/t5_config.json \
-     --pytorch_dump_output $T5/pytorch_model.bin
+    transformers-cli convert --model_type t5 \
+      --tf_checkpoint $T5/t5_model.ckpt \
+      --config $T5/t5_config.json \
+      --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -0,0 +1,295 @@
+..
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+
+
+Debugging
+=======================================================================================================================
+
+Underflow and Overflow Detection
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+   This feature is currently available for PyTorch-only.
+
+.. note::
+
+   This feature can be used with any ``nn.Module``-based model
+
+If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
+activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
+you can accomplish that easily by activating a special module that will do the detection automatically.
+
+If you're using :class:`~transformers.Trainer`, you just need to add:
+
+.. code-block:: bash
+
+    --debug underflow_overflow
+
+to the normal command line arguments, or pass ``debug="underflow_overflow"`` when creating the
+:class:`~transformers.TrainingArguments` object.
+
+If you're using your own training loop or another Trainer you can accomplish the same with:
+
+.. code-block:: python
+
+    from .debug_utils import DebugUnderflowOverflow
+    debug_overflow = DebugUnderflowOverflow(model)
+
+:class:`~transformers.debug_utils.DebugUnderflowOverflow` inserts hooks into the model that immediately after each
+forward call will test input and output variables and also the corresponding module's weights. As soon as ``inf`` or
+``nan`` is detected in at least one element of the activations or weights, the program will assert and print a report
+like this (this was caught with ``google/mt5-small`` under fp16 mixed precision):
+
+.. code-block::
+
+    Detected inf/nan during batch_number=0
+    Last 21 forward frames:
+    abs min  abs max  metadata
+                      encoder.block.1.layer.1.DenseReluDense.dropout Dropout
+    0.00e+00 2.57e+02 input[0]
+    0.00e+00 2.85e+02 output
+    [...]
+                      encoder.block.2.layer.0 T5LayerSelfAttention
+    6.78e-04 3.15e+03 input[0]
+    2.65e-04 3.42e+03 output[0]
+                 None output[1]
+    2.25e-01 1.00e+04 output[2]
+                      encoder.block.2.layer.1.layer_norm T5LayerNorm
+    8.69e-02 4.18e-01 weight
+    2.65e-04 3.42e+03 input[0]
+    1.79e-06 4.65e+00 output
+                      encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+    2.17e-07 4.50e+00 weight
+    1.79e-06 4.65e+00 input[0]
+    2.68e-06 3.70e+01 output
+                      encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+    8.08e-07 2.66e+01 weight
+    1.79e-06 4.65e+00 input[0]
+    1.27e-04 2.37e+02 output
+                      encoder.block.2.layer.1.DenseReluDense.dropout Dropout
+    0.00e+00 8.76e+03 input[0]
+    0.00e+00 9.74e+03 output
+                      encoder.block.2.layer.1.DenseReluDense.wo Linear
+    1.01e-06 6.44e+00 weight
+    0.00e+00 9.74e+03 input[0]
+    3.18e-04 6.27e+04 output
+                      encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+    1.79e-06 4.65e+00 input[0]
+    3.18e-04 6.27e+04 output
+                      encoder.block.2.layer.1.dropout Dropout
+    3.18e-04 6.27e+04 input[0]
+    0.00e+00      inf output
+
+The example output has been trimmed in the middle for brevity.
+
+The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames,
+the inputs and outputs were in the range of ``1e4``. So when this training was done under fp16 mixed precision the very
+last step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under
+``fp16`` the activations must remain way below ``1e4``, because ``1e4 * 1e4 = 1e8`` so any matrix multiplication with
+large activations is going to lead to a numerical overflow condition.
+
+At the very start of the trace you can discover at which batch number the problem occurred (here ``Detected inf/nan
+during batch_number=0`` means the problem occurred on the first batch).
+
+Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting
+for. If we look just at this frame:
+
+.. code-block::
+
+                      encoder.block.2.layer.1.layer_norm T5LayerNorm
+    8.69e-02 4.18e-01 weight
+    2.65e-04 3.42e+03 input[0]
+    1.79e-06 4.65e+00 output
+
+Here, ``encoder.block.2.layer.1.layer_norm`` indicates that it was a layer norm for the first layer, of the second
+block of the encoder. And the specific calls of the ``forward`` is ``T5LayerNorm``.
+
+Let's look at the last few frames of that report:
+
+.. code-block::
+
+        Detected inf/nan during batch_number=0
+        Last 21 forward frames:
+        abs min  abs max  metadata
+        [...]
+                          encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+        2.17e-07 4.50e+00 weight
+        1.79e-06 4.65e+00 input[0]
+        2.68e-06 3.70e+01 output
+                          encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+        8.08e-07 2.66e+01 weight
+        1.79e-06 4.65e+00 input[0]
+        1.27e-04 2.37e+02 output
+                          encoder.block.2.layer.1.DenseReluDense.wo Linear
+        1.01e-06 6.44e+00 weight
+        0.00e+00 9.74e+03 input[0]
+        3.18e-04 6.27e+04 output
+                          encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+        1.79e-06 4.65e+00 input[0]
+        3.18e-04 6.27e+04 output
+                          encoder.block.2.layer.1.dropout Dropout
+        3.18e-04 6.27e+04 input[0]
+        0.00e+00      inf output
+
+The last frame reports for ``Dropout.forward`` function with the first entry for the only input and the second for the
+only output. You can see that it was called from an attribute ``dropout`` inside ``DenseReluDense`` class. We can see
+that it happened during the first layer, of the 2nd block, during the very first batch. Finally, the absolute largest
+input elements was ``6.27e+04`` and same for the output was ``inf``.
+
+You can see here, that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value was
+around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which renormalizes
+the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an
+overlow (``inf``).
+
+As you can see it's the previous frames that we need to look into when the numbers start going into very large for fp16
+numbers.
+
+Let's match the report to the code from ``models/t5/modeling_t5.py``:
+
+.. code-block:: python
+
+    class T5DenseGatedGeluDense(nn.Module):
+        def __init__(self, config):
+            super().__init__()
+            self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
+            self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
+            self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
+            self.dropout = nn.Dropout(config.dropout_rate)
+            self.gelu_act = ACT2FN["gelu_new"]
+
+        def forward(self, hidden_states):
+            hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
+            hidden_linear = self.wi_1(hidden_states)
+            hidden_states = hidden_gelu * hidden_linear
+            hidden_states = self.dropout(hidden_states)
+            hidden_states = self.wo(hidden_states)
+            return hidden_states
+
+Now it's easy to see the ``dropout`` call, and all the previous calls as well.
+
+Since the detection is happening in a forward hook, these reports are printed immediately after each ``forward``
+returns.
+
+Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers
+started to go up and most likely switch to the ``fp32`` mode here, so that the numbers don't overflow when multiplied
+or summed up. Of course, there might be other solutions. For example, we could turn off ``amp`` temporarily if it's
+enabled, after moving the original ``forward`` into a helper wrapper, like so:
+
+.. code-block:: python
+
+    def _forward(self, hidden_states):
+        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
+        hidden_linear = self.wi_1(hidden_states)
+        hidden_states = hidden_gelu * hidden_linear
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+    import torch
+    def forward(self, hidden_states):
+        if torch.is_autocast_enabled():
+             with torch.cuda.amp.autocast(enabled=False):
+                 return self._forward(hidden_states)
+         else:
+             return self._forward(hidden_states)
+
+Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
+want to analyse the intermediary stages of any specific ``forward`` function as well. In such a case you can use the
+``detect_overflow`` helper function to inject the detector where you want it, for example:
+
+.. code-block:: python
+
+    from debug_utils import detect_overflow
+
+    class T5LayerFF(nn.Module):
+        [...]
+        def forward(self, hidden_states):
+            forwarded_states = self.layer_norm(hidden_states)
+            detect_overflow(forwarded_states, "after layer_norm")
+            forwarded_states = self.DenseReluDense(forwarded_states)
+            detect_overflow(forwarded_states, "after DenseReluDense")
+            return hidden_states + self.dropout(forwarded_states)
+
+You can see that we added 2 of these and now we track if ``inf`` or ``nan`` for ``forwarded_states`` was detected
+somewhere in between.
+
+Actually, the detector already reports these because each of the calls in the example above is a `nn.Module``, but
+let's say if you had some local direct calculations this is how you'd do that.
+
+Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
+its default, e.g.:
+
+.. code-block:: python
+
+    from .debug_utils import DebugUnderflowOverflow
+    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+
+Specific batch absolute mix and max value tracing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.
+
+Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a given
+batch, and only do that for batches 1 and 3. Then you instantiate this class as:
+
+.. code-block:: python
+
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+
+And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does.
+
+Batches are 0-indexed.
+
+This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
+right to that area. Here is a sample truncated output for such configuration:
+
+.. code-block::
+
+                      *** Starting batch number=1 ***
+    abs min  abs max  metadata
+                      shared Embedding
+    1.01e-06 7.92e+02 weight
+    0.00e+00 2.47e+04 input[0]
+    5.36e-05 7.92e+02 output
+    [...]
+                      decoder.dropout Dropout
+    1.60e-07 2.27e+01 input[0]
+    0.00e+00 2.52e+01 output
+                      decoder T5Stack
+         not a tensor output
+                      lm_head Linear
+    1.01e-06 7.92e+02 weight
+    0.00e+00 1.11e+00 input[0]
+    6.06e-02 8.39e+01 output
+                       T5ForConditionalGeneration
+         not a tensor output
+
+                      *** Starting batch number=3 ***
+    abs min  abs max  metadata
+                      shared Embedding
+    1.01e-06 7.92e+02 weight
+    0.00e+00 2.78e+04 input[0]
+    5.36e-05 7.92e+02 output
+    [...]
+
+Here you will get a huge number of frames dumped - as many as there were forward calls in your model, so it may or may
+not what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For example, if
+a problem starts happening at batch number 150. So you can dump traces for batches 149 and 150 and compare where
+numbers started to diverge.
+
+You can also specify the batch number after which to stop the training, with:
+
+.. code-block:: python
+
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -182,7 +182,7 @@ such:

 .. code-block::

-   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+    >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

 We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
 arguments (and not a list, like before) like this:
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,12 +1,12 @@
 Transformers
 =======================================================================================================================

-State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0.
+State-of-the-art Natural Language Processing for Jax, Pytorch and TensorFlow

 🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose
 architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural
-Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-TensorFlow 2.0 and PyTorch.
+Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between Jax,
+PyTorch and TensorFlow.

 This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`_.

@@ -22,7 +22,7 @@ State-of-the-art NLP for everyone:
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators

-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -43,11 +43,11 @@ Lower compute costs, smaller carbon footprint:
 Choose the right framework for every part of a model's lifetime:

 - Train state-of-the-art models in 3 lines of code
- Deep interoperability between TensorFlow 2.0 and PyTorch models
- Move a single model between TF2.0/PyTorch frameworks at will
+- Deep interoperability between Jax, Pytorch and TensorFlow models
+- Move a single model between Jax/PyTorch/TensorFlow frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production

-Experimental support for Flax with a few models right now, expected to grow in the coming months.
+The support for Jax is still experimental (with a few models right now), expect to see it grow in the coming months!

 `All the model checkpoints <https://huggingface.co/models>`__ are seamlessly integrated from the huggingface.co `model
 hub <https://huggingface.co>`__ where they are uploaded directly by `users <https://huggingface.co/users>`__ and
@@ -74,8 +74,8 @@ The documentation is organized in five parts:
    - **MODELS** for the classes and functions related to each model implemented in the library.
    - **INTERNAL HELPERS** for the classes and functions we use internally.

-The library currently contains PyTorch, Tensorflow and Flax implementations, pretrained model weights, usage scripts
-and conversion utilities for the following models:
+The library currently contains Jax, PyTorch and Tensorflow implementations, pretrained model weights, usage scripts and
+conversion utilities for the following models:

 ..
    This list is updated automatically from the README with `make fix-copies`. Do not update manually!
@@ -100,136 +100,160 @@ and conversion utilities for the following models:
 6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
   for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
   Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-7. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+7. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
+   Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava
+   Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+8. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-8. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
+9. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-9. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
-   <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
-10. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+10. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+    <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
+11. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+12. :doc:`CLIP <model_doc/clip>` from (OpenAI) released with the paper `Learning Transferable Visual Models From
+    Natural Language Supervision <https://arxiv.org/abs/2103.00020>`__ by Alec Radford, Jong Wook Kim, Chris Hallacy,
+    Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen
+    Krueger, Ilya Sutskever.
+13. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-12. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+14. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
+    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
+    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
+    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
+    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+15. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-13. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+16. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-14. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+17. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-15. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+18. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
+    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
+    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+19. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-16. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+20. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-17. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+21. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-18. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+22. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-19. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+23. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-20. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+24. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-21. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+25. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-22. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+26. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-23. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+27. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-24. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+28. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-25. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+29. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-26. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+30. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-27. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+31. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-28. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+32. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
+    Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
+    Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
+33. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-29. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+34. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-30. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+35. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-31. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+36. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-32. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+37. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-33. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+38. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
+    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+39. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
+    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+40. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-34. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+41. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-35. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+42. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-36. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+43. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-37. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+44. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-38. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+45. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-39. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+46. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-40. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+47. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-41. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+48. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-42. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+49. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-43. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+50. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-44. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+51. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-45. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+52. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-46. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+53. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-47. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+54. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-48. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+55. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-49. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+56. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-50. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+57. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -237,8 +261,8 @@ and conversion utilities for the following models:
 .. _bigtable:

 The table below represents the current support in the library for each of those models, whether they have a Python
-tokenizer (called "slow"). A "fast" tokenizer backed by the 🤗 Tokenizers library, whether they have support in PyTorch,
-TensorFlow and/or Flax.
+tokenizer (called "slow"). A "fast" tokenizer backed by the 🤗 Tokenizers library, whether they have support in Jax (via
+Flax), PyTorch, and/or TensorFlow.

 ..
    This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
@@ -256,12 +280,16 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           BigBird           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           BigBird           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       BigBirdPegasus        |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -270,13 +298,15 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           DeBERTa           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            DeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -292,6 +322,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            LUKE             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -304,6 +336,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -376,6 +410,7 @@ TensorFlow and/or Flax.

    pretrained_models
    examples
+    troubleshooting
    custom_datasets
    notebooks
    sagemaker
@@ -386,6 +421,7 @@ TensorFlow and/or Flax.
    add_new_model
    fast_tokenizers
    testing
+    debugging
    serialization

 .. toctree::
@@ -402,6 +438,7 @@ TensorFlow and/or Flax.

    main_classes/callback
    main_classes/configuration
+    main_classes/data_collator
    main_classes/logging
    main_classes/model
    main_classes/optimizer_schedules
@@ -423,15 +460,20 @@ TensorFlow and/or Flax.
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
+    model_doc/bert_japanese
    model_doc/bigbird
+    model_doc/bigbird_pegasus
    model_doc/blenderbot
    model_doc/blenderbot_small
    model_doc/bort
    model_doc/camembert
+    model_doc/clip
    model_doc/convbert
+    model_doc/cpm
    model_doc/ctrl
    model_doc/deberta
    model_doc/deberta_v2
+    model_doc/deit
    model_doc/dialogpt
    model_doc/distilbert
    model_doc/dpr
@@ -445,10 +487,13 @@ TensorFlow and/or Flax.
    model_doc/layoutlm
    model_doc/led
    model_doc/longformer
+    model_doc/luke
    model_doc/lxmert
    model_doc/marian
    model_doc/m2m_100
    model_doc/mbart
+    model_doc/megatron_bert
+    model_doc/megatron_gpt2
    model_doc/mobilebert
    model_doc/mpnet
    model_doc/mt5
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -149,12 +149,6 @@ So if you don't have any specific environment variable set, the cache directory
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
 environment variable for ``TRANSFORMERS_CACHE``.

-### Note on model downloads (Continuous Integration or large-scale deployments)
-
-If you expect to be downloading large volumes of models (more than 10,000) from huggingface.co (for instance through
-your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
-faster, and cheaper. Feel free to contact us privately, we'd love to help with this.
-
 ### Offline mode

 It's possible to run 🤗 Transformers in a firewalled or a no-network environment.
@@ -168,13 +162,13 @@ Here is an example of how this can be used on a filesystem that is shared betwee
 On the instance with the normal network run your program which will download and cache models (and optionally datasets if you use 🤗 Datasets). For example:

 ```
-python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```

 and then with the same filesystem you can now run the same program on a firewalled instance:
 ```
 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
 ```
 and it should succeed without any hanging waiting to timeout.

--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@@ -1,4 +1,4 @@
-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -46,3 +46,9 @@ Distributed Evaluation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.HfArgumentParser
+
+
+Debug Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.debug_utils.DebugUnderflowOverflow
--- a/docs/source/main_classes/data_collator.rst
+++ b/docs/source/main_classes/data_collator.rst
@@ -0,0 +1,71 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Data Collator
+-----------------------------------------------------------------------------------------------------------------------
+
+Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
+the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.
+
+To be able to build batches, data collators may apply some processing (like padding). Some of them (like
+:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
+oin the formed batch.
+
+Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.
+
+
+Default data collator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.data.data_collator.default_data_collator
+
+
+DataCollatorWithPadding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorWithPadding
+    :members:
+
+
+DataCollatorForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification
+    :members:
+
+
+DataCollatorForSeq2Seq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq
+    :members:
+
+
+DataCollatorForLanguageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
+    :members: mask_tokens
+
+
+DataCollatorForWholeWordMask
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
+    :members: mask_tokens
+
+
+DataCollatorForPermutationLanguageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
+    :members: mask_tokens
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -73,3 +73,10 @@ Generation

 .. autoclass:: transformers.generation_tf_utils.TFGenerationMixin
    :members:
+
+
+Pushing to the Hub
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.file_utils.PushToHubMixin
+    :members:
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -13,8 +13,8 @@
 Model outputs
 -----------------------------------------------------------------------------------------------------------------------

-PyTorch models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those
-are data structures containing all the information returned by the model, but that can also be used as tuples or
+All models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those are
+data structures containing all the information returned by the model, but that can also be used as tuples or
 dictionaries.

 Let's see of this looks on an example:
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -23,6 +23,7 @@ There are two categories of pipeline abstractions to be aware about:
 - The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
 - The other task-specific pipelines:

+    - :class:`~transformers.AutomaticSpeechRecognitionPipeline`
    - :class:`~transformers.ConversationalPipeline`
    - :class:`~transformers.FeatureExtractionPipeline`
    - :class:`~transformers.FillMaskPipeline`
@@ -35,6 +36,7 @@ There are two categories of pipeline abstractions to be aware about:
    - :class:`~transformers.ZeroShotClassificationPipeline`
    - :class:`~transformers.Text2TextGenerationPipeline`
    - :class:`~transformers.TableQuestionAnsweringPipeline`
+    - :class:`~transformers.ImageClassificationPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -48,6 +50,13 @@ pipeline but requires an additional argument which is the `task`.
 The task specific pipelines
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+AutomaticSpeechRecognitionPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.AutomaticSpeechRecognitionPipeline
+    :special-members: __call__
+    :members:
+
 ConversationalPipeline
 =======================================================================================================================

@@ -71,6 +80,13 @@ FillMaskPipeline
    :special-members: __call__
    :members:

+ImageClassificationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.ImageClassificationPipeline
+    :special-members: __call__
+    :members:
+
 NerPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -68,8 +68,8 @@ Additionally, the following method can be used to load values from a data file a
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-An example using these processors is given in the `run_glue.py
-<https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.
+An example using these processors is given in the :prefix_link:`run_glue.py
+<examples/legacy/text-classification/run_glue.py>` script.


 XNLI
@@ -89,8 +89,8 @@ This library hosts the processor to load the XNLI data:

 Please note that since the gold labels are available on the test set, evaluation is performed on the test set.

-An example using these processors is given in the `run_xnli.py
-<https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.
+An example using these processors is given in the :prefix_link:`run_xnli.py
+<examples/legacy/text-classification/run_xnli.py>` script.


 SQuAD
@@ -169,4 +169,4 @@ Using `tensorflow_datasets` is as easy as using a data file:


 Another example using these processors is given in the :prefix_link:`run_squad.py
-<examples/question-answering/run_squad.py>` script.
+<examples/legacy/question-answering/run_squad.py>` script.
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -169,8 +169,8 @@ Regarding the `TFTrainer` class:
 - The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
 - The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.

-Regarding the `TrainerArgument` class:
- The `TrainerArgument` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
+Regarding the `TrainingArguments` class:
+- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.

 Regarding the Transfo-XL model:
 - The Transfo-XL configuration attribute `tie_weight` becomes `tie_words_embeddings`.
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -43,7 +43,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

-The original code can be found `here <https://github.com/google-research/ALBERT>`__.
+This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The original code can be found `here
+<https://github.com/google-research/ALBERT>`__.

 AlbertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -44,6 +44,13 @@ AutoTokenizer
    :members:


+AutoFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoFeatureExtractor
+    :members:
+
+
 AutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -121,6 +128,13 @@ AutoModelForTableQuestionAnswering
    :members:


+AutoModelForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForImageClassification
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -35,14 +35,15 @@ According to the abstract,
  state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains
  of up to 6 ROUGE.

-The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`__.
+This model was contributed by `sshleifer <https://huggingface.co/sshleifer>`__. The Authors' code can be found `here
+<https://github.com/pytorch/fairseq/tree/master/examples/bart>`__.


 Examples
 _______________________________________________________________________________________________________________________

 - Examples and scripts for fine-tuning BART and other models for sequence to sequence tasks can be found in
-  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+  :prefix_link:`examples/pytorch/summarization/ <examples/pytorch/summarization/README.md>`.
 - An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
  object can be found in this `forum discussion
  <https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904>`__.
--- a/docs/source/model_doc/barthez.rst
+++ b/docs/source/model_doc/barthez.rst
@@ -16,7 +16,7 @@ BARThez
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The BARThez model was proposed in `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model`
+The BARThez model was proposed in `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model
 <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
 2020.

@@ -35,14 +35,15 @@ summarization dataset, OrangeSum, that we release with this paper. We also conti
 pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
 provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*

-The Authors' code can be found `here <https://github.com/moussaKam/BARThez>`__.
+This model was contributed by `moussakam <https://huggingface.co/moussakam>`__. The Authors' code can be found `here
+<https://github.com/moussaKam/BARThez>`__.


 Examples
 _______________________________________________________________________________________________________________________

 - BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
-  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+  :prefix_link:`examples/pytorch/summarization/ <examples/pytorch/summarization/README.md>`.


 BarthezTokenizer
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -42,7 +42,8 @@ Tips:
 - BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is
  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation.

-The original code can be found `here <https://github.com/google-research/bert>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://github.com/google-research/bert>`__.

 BertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -90,7 +91,7 @@ BertForPreTraining
    :members: forward


-BertModelLMHeadModel
+BertLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertLMHeadModel
--- a/docs/source/model_doc/bert_japanese.rst
+++ b/docs/source/model_doc/bert_japanese.rst
@@ -0,0 +1,80 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BertJapanese
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BERT models trained on Japanese text.
+
+There are models with two different tokenization methods:
+
+- Tokenize with MeCab and WordPiece. This requires some extra dependencies, `fugashi
+  <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.
+- Tokenize into characters.
+
+To use `MecabTokenizer`, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install
+from source) to install dependencies.
+
+See `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__.
+
+Example of using a model with MeCab and WordPiece tokenization:
+
+.. code-block::
+
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer 
+
+    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
+    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
+
+    >>> ## Input Japanese Text
+    >>> line = "吾輩は猫である。"
+
+    >>> inputs = tokenizer(line, return_tensors="pt")
+
+    >>> print(tokenizer.decode(inputs['input_ids'][0]))
+    [CLS] 吾輩 は 猫 で ある 。 [SEP]
+
+    >>> outputs = bertjapanese(**inputs)
+
+Example of using a model with Character tokenization:
+
+.. code-block::
+
+    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
+    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
+
+    >>> ## Input Japanese Text
+    >>> line = "吾輩は猫である。"
+
+    >>> inputs = tokenizer(line, return_tensors="pt")
+
+    >>> print(tokenizer.decode(inputs['input_ids'][0]))
+    [CLS] 吾 輩 は 猫 で あ る 。 [SEP]
+
+    >>> outputs = bertjapanese(**inputs)
+
+Tips:
+
+- This implementation is the same as BERT, except for tokenization method. Refer to the :doc:`documentation of BERT
+  <bert>` for more usage examples.
+
+This model was contributed by `cl-tohoku <https://huggingface.co/cl-tohoku>`__.
+
+BertJapaneseTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertJapaneseTokenizer
+    :members: 
--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -38,22 +38,22 @@ Usage:

 .. code-block::

-  # leverage checkpoints for Bert2Bert model...
-  # use BERT's cls token as BOS token and sep token as EOS token
-  encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
-  # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
-  decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
-  bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+    >>> # leverage checkpoints for Bert2Bert model...
+    >>> # use BERT's cls token as BOS token and sep token as EOS token
+    >>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
+    >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
+    >>> decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
+    >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

-  # create tokenizer...
-  tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+    >>> # create tokenizer...
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

-  input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
-  labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+    >>> input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
+    >>> labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

-  # train...
-  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
-  loss.backward()
+    >>> # train...
+    >>> loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
+    >>> loss.backward()


 - Pretrained :class:`~transformers.EncoderDecoderModel` are also directly available in the model hub, e.g.,
@@ -61,15 +61,15 @@ Usage:

 .. code-block::

-  # instantiate sentence fusion model
-  sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
-  tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
+    >>> # instantiate sentence fusion model
+    >>> sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")

-  input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+    >>> input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids

-  outputs = sentence_fuser.generate(input_ids)
+    >>> outputs = sentence_fuser.generate(input_ids)

-  print(tokenizer.decode(outputs[0]))
+    >>> print(tokenizer.decode(outputs[0]))


 Tips:
@@ -79,7 +79,8 @@ Tips:
 - For summarization, sentence splitting, sentence fusion and translation, no special tokens are required for the input.
  Therefore, no EOS token should be added to the end of the input.

-The original code can be found `here <https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder>`__.
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
+found `here <https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder>`__.

 BertGenerationConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/bertweet.rst
+++ b/docs/source/model_doc/bertweet.rst
@@ -31,31 +31,31 @@ Example of use:

 .. code-block::

-  import torch
-  from transformers import AutoModel, AutoTokenizer 
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer 

-  bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+    >>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

-  # For transformers v4.x+: 
-  tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+    >>> # For transformers v4.x+: 
+    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

-  # For transformers v3.x: 
-  # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+    >>> # For transformers v3.x: 
+    >>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

-  # INPUT TWEET IS ALREADY NORMALIZED!
-  line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+    >>> # INPUT TWEET IS ALREADY NORMALIZED!
+    >>> line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

-  input_ids = torch.tensor([tokenizer.encode(line)])
+    >>> input_ids = torch.tensor([tokenizer.encode(line)])

-  with torch.no_grad():
-      features = bertweet(input_ids)  # Models outputs are now tuples
+    >>> with torch.no_grad():
+    ...     features = bertweet(input_ids)  # Models outputs are now tuples

-  ## With TensorFlow 2.0+:
-  # from transformers import TFAutoModel
-  # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+    >>> # With TensorFlow 2.0+:
+    >>> # from transformers import TFAutoModel
+    >>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")

-
-The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
+This model was contributed by `dqnguyen <https://huggingface.co/dqnguyen>`__. The original code can be found `here
+<https://github.com/VinAIResearch/BERTweet>`__.

 BertweetTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/bigbird.rst
+++ b/docs/source/model_doc/bigbird.rst
@@ -50,7 +50,8 @@ Tips:
 - Current implementation supports only **ITC**.
 - Current implementation doesn't support **num_random_blocks = 0**

-The original code can be found `here <https://github.com/google-research/bigbird>`__.
+This model was contributed by `vasudevgupta <https://huggingface.co/vasudevgupta>`__. The original code can be found
+`here <https://github.com/google-research/bigbird>`__.

 BigBirdConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -66,6 +67,11 @@ BigBirdTokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
        create_token_type_ids_from_sequences, save_vocabulary

+BigBirdTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdTokenizerFast
+    :members:

 BigBird specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/bigbird_pegasus.rst
+++ b/docs/source/model_doc/bigbird_pegasus.rst
@@ -0,0 +1,98 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BigBirdPegasus
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BigBird model was proposed in `Big Bird: Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by
+Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
+Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention
+based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse
+attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
+has been shown that applying sparse, global, and random attention approximates full attention, while being
+computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
+BigBird has shown improved performance on various long document NLP tasks, such as question answering and
+summarization, compared to BERT or RoBERTa.
+
+The abstract from the paper is the following:
+
+*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
+Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
+length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
+reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
+is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
+theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
+sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
+8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
+BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
+propose novel applications to genomics data.*
+
+Tips:
+
+- For an in-detail explanation on how BigBird's attention works, see `this blog post
+  <https://huggingface.co/blog/big-bird>`__.
+- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using
+  **original_full** is advised as there is no benefit in using **block_sparse** attention.
+- The code currently uses window size of 3 blocks and 2 global blocks.
+- Sequence length must be divisible by block size.
+- Current implementation supports only **ITC**.
+- Current implementation doesn't support **num_random_blocks = 0**.
+- BigBirdPegasus uses the `PegasusTokenizer
+  <https://github.com/huggingface/transformers/blob/master/src/transformers/models/pegasus/tokenization_pegasus.py>`__.
+
+The original code can be found `here <https://github.com/google-research/bigbird>`__.
+
+BigBirdPegasusConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdPegasusConfig
+    :members:
+
+
+BigBirdPegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdPegasusModel
+    :members: forward
+
+
+BigBirdPegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdPegasusForConditionalGeneration
+    :members: forward
+
+
+BigBirdPegasusForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdPegasusForSequenceClassification
+    :members: forward
+
+
+BigBirdPegasusForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdPegasusForQuestionAnswering
+    :members: forward
+
+
+BigBirdPegasusForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdPegasusForCausalLM
+    :members: forward
+
+
--- a/docs/source/model_doc/blenderbot.rst
+++ b/docs/source/model_doc/blenderbot.rst
@@ -36,7 +36,8 @@ and code publicly available. Human evaluations show our best models are superior
 dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
 failure cases of our models.*

-The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__ .
+This model was contributed by `sshleifer <https://huggingface.co/sshleifer>`__. The authors' code can be found `here
+<https://github.com/facebookresearch/ParlAI>`__ .


 Implementation Notes
--- a/docs/source/model_doc/blenderbot_small.rst
+++ b/docs/source/model_doc/blenderbot_small.rst
@@ -39,7 +39,8 @@ and code publicly available. Human evaluations show our best models are superior
 dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
 failure cases of our models.*

-The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__ .
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The authors' code can be
+found `here <https://github.com/facebookresearch/ParlAI>`__ .

 BlenderbotSmallConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/bort.rst
+++ b/docs/source/model_doc/bort.rst
@@ -43,4 +43,5 @@ Tips:
  that is sadly not open-sourced yet. It would be very useful for the community, if someone tries to implement the
  algorithm to make BORT fine-tuning work.

-The original code can be found `here <https://github.com/alexa/bort/>`__.
+This model was contributed by `stefan-it <https://huggingface.co/stefan-it>`__. The original code can be found `here
+<https://github.com/alexa/bort/>`__.
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -37,7 +37,8 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage examples
  as well as the information relative to the inputs and outputs.

-The original code can be found `here <https://camembert-model.fr/>`__.
+This model was contributed by `camembert <https://huggingface.co/camembert>`__. The original code can be found `here
+<https://camembert-model.fr/>`__.

 CamembertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/clip.rst
+++ b/docs/source/model_doc/clip.rst
@@ -0,0 +1,154 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+CLIP
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CLIP model was proposed in `Learning Transferable Visual Models From Natural Language Supervision
+<https://arxiv.org/abs/2103.00020>`__ by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh,
+Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. CLIP
+(Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. It can be
+instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing
+for the task, similarly to the zero-shot capabilities of GPT-2 and 3.
+
+The abstract from the paper is the following:
+
+*State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This
+restricted form of supervision limits their generality and usability since additional labeled data is needed to specify
+any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a
+much broader source of supervision. We demonstrate that the simple pre-training task of predicting which caption goes
+with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400
+million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference
+learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study
+the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks
+such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The
+model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need
+for any dataset specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot
+without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained
+model weights at this https URL.*
+
+Usage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+CLIP is a multi-modal vision and language model. It can be used for image-text similarity and for zero-shot image
+classification. CLIP uses a ViT like transformer to get visual features and a causal language model to get the text
+features. Both the text and visual features are then projected to a latent space with identical dimension. The dot
+product between the projected image and text features is then used as a similar score.
+
+To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
+which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image. The authors
+also add absolute position embeddings, and feed the resulting sequence of vectors to a standard Transformer encoder.
+The :class:`~transformers.CLIPFeatureExtractor` can be used to resize (or rescale) and normalize images for the model.
+
+The :class:`~transformers.CLIPTokenizer` is used to encode the text. The :class:`~transformers.CLIPProcessor` wraps
+:class:`~transformers.CLIPFeatureExtractor` and :class:`~transformers.CLIPTokenizer` into a single instance to both
+encode the text and prepare the images. The following example shows how to get the image-text similarity scores using
+:class:`~transformers.CLIPProcessor` and :class:`~transformers.CLIPModel`.
+
+
+.. code-block::
+
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> from transformers import CLIPProcessor, CLIPModel
+
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+
+
+This model was contributed by `valhalla <https://huggingface.co/valhalla>`__. The original code can be found `here
+<https://github.com/openai/CLIP>`__.
+
+CLIPConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPConfig
+    :members: from_text_vision_configs
+
+
+CLIPTextConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPTextConfig
+    :members:
+
+
+CLIPVisionConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPVisionConfig
+    :members:
+
+
+
+CLIPTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+CLIPTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPTokenizerFast
+    :members:
+
+
+CLIPFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPFeatureExtractor
+    :members:
+
+
+CLIPProcessor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPProcessor
+    :members:
+
+
+
+CLIPModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPModel
+    :members: forward, get_text_features, get_image_features
+
+
+CLIPTextModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPTextModel
+    :members: forward
+
+
+CLIPVisionModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CLIPVisionModel
+    :members: forward
--- a/docs/source/model_doc/convbert.rst
+++ b/docs/source/model_doc/convbert.rst
@@ -34,8 +34,10 @@ ConvBERT significantly outperforms BERT and its variants in various downstream t
 fewer model parameters. Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while
 using less than 1/4 training cost. Code and pre-trained models will be released.*

-ConvBERT training tips are similar to those of BERT. The original implementation can be found here:
-https://github.com/yitu-opensource/ConvBert
+ConvBERT training tips are similar to those of BERT.
+
+This model was contributed by `abhishek <https://huggingface.co/abhishek>`__. The original implementation can be found
+here: https://github.com/yitu-opensource/ConvBert

 ConvBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -56,8 +58,7 @@ ConvBertTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.ConvBertTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:


 ConvBertModel
--- a/docs/source/model_doc/cpm.rst
+++ b/docs/source/model_doc/cpm.rst
@@ -0,0 +1,45 @@
+..
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+CPM
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CPM model was proposed in `CPM: A Large-scale Generative Chinese Pre-trained Language Model
+<https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin,
+Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen,
+Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+
+The abstract from the paper is the following:
+
+*Pre-trained Language Models (PLMs) have proven to be beneficial for various downstream NLP tasks. Recently, GPT-3,
+with 175 billion parameters and 570GB training data, drew a lot of attention due to the capacity of few-shot (even
+zero-shot) learning. However, applying GPT-3 to address Chinese NLP tasks is still challenging, as the training corpus
+of GPT-3 is primarily English, and the parameters are not publicly available. In this technical report, we release the
+Chinese Pre-trained Language Model (CPM) with generative pre-training on large-scale Chinese training data. To the best
+of our knowledge, CPM, with 2.6 billion parameters and 100GB Chinese training data, is the largest Chinese pre-trained
+language model, which could facilitate several downstream Chinese NLP tasks, such as conversation, essay generation,
+cloze test, and language understanding. Extensive experiments demonstrate that CPM achieves strong performance on many
+NLP tasks in the settings of few-shot (even zero-shot) learning.*
+
+This model was contributed by `canwenxu <https://huggingface.co/canwenxu>`__. The original implementation can be found
+here: https://github.com/TsinghuaAI/CPM-Generate
+
+Note: We only have a tokenizer here, since the model architecture is the same as GPT-2.
+
+CpmTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CpmTokenizer
+    :members:
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -46,7 +46,8 @@ Tips:
  `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
  this argument.

-The original code can be found `here <https://github.com/salesforce/ctrl>`__.
+This model was contributed by `keskarnitishr <https://huggingface.co/keskarnitishr>`__. The original code can be found
+`here <https://github.com/salesforce/ctrl>`__.


 CTRLConfig
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -38,7 +38,8 @@ the training data performs consistently better on a wide range of NLP tasks, ach
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*


-The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+<https://github.com/microsoft/DeBERTa>`__.


 DebertaConfig
@@ -55,6 +56,12 @@ DebertaTokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
        create_token_type_ids_from_sequences, save_vocabulary

+DebertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaTokenizerFast
+    :members: build_inputs_with_special_tokens, create_token_type_ids_from_sequences
+

 DebertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/deberta_v2.rst
+++ b/docs/source/model_doc/deberta_v2.rst
@@ -58,7 +58,8 @@ New in v2:
 - **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
  performance of downstream tasks.

-The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+<https://github.com/microsoft/DeBERTa>`__.


 DebertaV2Config
--- a/docs/source/model_doc/deit.rst
+++ b/docs/source/model_doc/deit.rst
@@ -0,0 +1,111 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+DeiT
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
+    breaking changes to fix it in the future. If you see something strange, file a `Github Issue
+    <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The DeiT model was proposed in `Training data-efficient image transformers & distillation through attention
+<https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre
+Sablayrolles, Hervé Jégou. The `Vision Transformer (ViT) <https://huggingface.co/transformers/model_doc/vit.html>`__
+introduced in `Dosovitskiy et al., 2020 <https://arxiv.org/abs/2010.11929>`__ has shown that one can match or even
+outperform existing convolutional neural networks using a Transformer encoder (BERT-like). However, the ViT models
+introduced in that paper required training on expensive infrastructure for multiple weeks, using external data. DeiT
+(data-efficient image transformers) are more efficiently trained transformers for image classification, requiring far
+less data and far less computing resources compared to the original ViT models.
+
+The abstract from the paper is the following:
+
+*Recently, neural networks purely based on attention were shown to address image understanding tasks such as image
+classification. However, these visual transformers are pre-trained with hundreds of millions of images using an
+expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free
+transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision
+transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external
+data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation
+token ensuring that the student learns from the teacher through attention. We show the interest of this token-based
+distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets
+for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and
+models.*
+
+Tips:
+
+- Compared to ViT, DeiT models use a so-called distillation token to effectively learn from a teacher (which, in the
+  DeiT paper, is a ResNet like-model). The distillation token is learned through backpropagation, by interacting with
+  the class ([CLS]) and patch tokens through the self-attention layers.
+- There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top
+  of the final hidden state of the class token and not using the distillation signal, or (2) by placing both a
+  prediction head on top of the class token and on top of the distillation token. In that case, the [CLS] prediction
+  head is trained using regular cross-entropy between the prediction of the head and the ground-truth label, while the
+  distillation prediction head is trained using hard distillation (cross-entropy between the prediction of the
+  distillation head and the label predicted by the teacher). At inference time, one takes the average prediction
+  between both heads as final prediction. (2) is also called "fine-tuning with distillation", because one relies on a
+  teacher that has already been fine-tuned on the downstream dataset. In terms of models, (1) corresponds to
+  :class:`~transformers.DeiTForImageClassification` and (2) corresponds to
+  :class:`~transformers.DeiTForImageClassificationWithTeacher`.
+- Note that the authors also did try soft distillation for (2) (in which case the distillation prediction head is
+  trained using KL divergence to match the softmax output of the teacher), but hard distillation gave the best results.
+- All released checkpoints were pre-trained and fine-tuned on ImageNet-1k only. No external data was used. This is in
+  contrast with the original ViT model, which used external data like the JFT-300M dataset/Imagenet-21k for
+  pre-training.
+- The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into
+  :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. Techniques like data
+  augmentation, optimization, and regularization were used in order to simulate training on a much larger dataset
+  (while only using ImageNet-1k for pre-training). There are 4 variants available (in 3 different sizes):
+  `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`, `facebook/deit-base-patch16-224` and
+  `facebook/deit-base-patch16-384`. Note that one should use :class:`~transformers.DeiTFeatureExtractor` in order to
+  prepare images for the model.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__.
+
+
+DeiTConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTConfig
+    :members:
+
+
+DeiTFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTFeatureExtractor
+    :members: __call__
+
+
+DeiTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTModel
+    :members: forward
+
+
+DeiTForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTForImageClassification
+    :members: forward
+
+
+DeiTForImageClassificationWithTeacher
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DeiTForImageClassificationWithTeacher
+    :members: forward
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -44,8 +44,8 @@ Tips:
 - DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
  necessary though, just let us know if you need this option.

-The original code can be found `here
-<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__.
+This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. The original code can be found
+:prefix_link:`here <examples/research-projects/distillation>`.


 DistilBertConfig
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -30,7 +30,8 @@ our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% ab
 retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA
 benchmarks.*

-The original code can be found `here <https://github.com/facebookresearch/DPR>`__.
+This model was contributed by `lhoestq <https://huggingface.co/lhoestq>`__. The original code can be found `here
+<https://github.com/facebookresearch/DPR>`__.


 DPRConfig
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -54,7 +54,8 @@ Tips:
  :class:`~transformers.ElectraForPreTraining` model (the classification head will be randomly initialized as it
  doesn't exist in the generator).

-The original code can be found `here <https://github.com/google-research/electra>`__.
+This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The original code can be found `here
+<https://github.com/google-research/electra>`__.


 ElectraConfig
@@ -184,3 +185,52 @@ TFElectraForQuestionAnswering

 .. autoclass:: transformers.TFElectraForQuestionAnswering
    :members: call
+
+
+FlaxElectraModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraModel
+    :members: __call__
+
+
+FlaxElectraForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraForPreTraining
+    :members: __call__
+
+
+FlaxElectraForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraForMaskedLM
+    :members: __call__
+
+
+FlaxElectraForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraForSequenceClassification
+    :members: __call__
+
+
+FlaxElectraForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraForMultipleChoice
+    :members: __call__
+
+
+FlaxElectraForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraForTokenClassification
+    :members: __call__
+
+
+FlaxElectraForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxElectraForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -35,7 +35,8 @@ time they outperform other pretraining approaches. Different versions of FlauBER
 protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
 community for further reproducible experiments in French NLP.*

-The original code can be found `here <https://github.com/getalp/Flaubert>`__.
+This model was contributed by `formiel <https://huggingface.co/formiel>`__. The original code can be found `here
+<https://github.com/getalp/Flaubert>`__.


 FlaubertConfig
--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@@ -34,7 +34,8 @@ data, then decode using noisy channel model reranking. Our submissions are ranke
 human evaluation campaign. On En->De, our system significantly outperforms other systems as well as human translations.
 This system improves upon our WMT'18 submission by 4.5 BLEU points.*

-The original code can be found here <https://github.com/pytorch/fairseq/tree/master/examples/wmt19>__.
+This model was contributed by `stas <https://huggingface.co/stas>`__. The original code can be found here
+<https://github.com/pytorch/fairseq/tree/master/examples/wmt19>__.

 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/funnel.rst
+++ b/docs/source/model_doc/funnel.rst
@@ -49,7 +49,8 @@ Tips:
  :class:`~transformers.FunnelBaseModel`, :class:`~transformers.FunnelForSequenceClassification` and
  :class:`~transformers.FunnelForMultipleChoice`.

-The original code can be found `here <https://github.com/laiguokun/Funnel-Transformer>`__.
+This model was contributed by `sgugger <https://huggingface.co/sgugger>`__. The original code can be found `here
+<https://github.com/laiguokun/Funnel-Transformer>`__.


 FunnelConfig
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -45,7 +45,8 @@ Tips:
 `Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by Hugging Face
 showcasing the generative capabilities of several models. GPT is one of them.

-The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://github.com/openai/finetune-transformer-lm>`__.

 Note:

--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -45,7 +45,8 @@ Tips:
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
 different sizes: small, medium, large, xl and a distilled version of the small checkpoint: `distilgpt-2`.

-The original code can be found `here <https://openai.com/blog/better-language-models/>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://openai.com/blog/better-language-models/>`__.


 GPT2Config
--- a/docs/source/model_doc/gpt_neo.rst
+++ b/docs/source/model_doc/gpt_neo.rst
@@ -23,6 +23,8 @@ Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. It is a GPT2 like c
 The architecture is similar to GPT2 except that GPT Neo uses local attention in every other layer with a window size of
 256 tokens.

+This model was contributed by `valhalla <https://huggingface.co/valhalla>`__.
+
 Generation
 _______________________________________________________________________________________________________________________

@@ -38,9 +40,9 @@ The :obj:`generate()` method can be used to generate text using GPT Neo model.
    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
    ...          "researchers was the fact that the unicorns spoke perfect English."

-    >>> input_ids = tokenizer(unicorns, return_tensors="pt").input_ids
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids

-    >>> gen_tokens = model.generate(ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]


--- a/docs/source/model_doc/herbert.rst
+++ b/docs/source/model_doc/herbert.rst
@@ -40,23 +40,25 @@ Examples of use:

 .. code-block::

-  from transformers import HerbertTokenizer, RobertaModel
+    >>> from transformers import HerbertTokenizer, RobertaModel

-  tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
-  model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
+    >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+    >>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

-  encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
-  outputs = model(encoded_input)
+    >>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+    >>> outputs = model(encoded_input)

-  # HerBERT can also be loaded using AutoTokenizer and AutoModel:
-  import torch
-  from transformers import AutoModel, AutoTokenizer
+    >>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer

-  tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
-  model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
+    >>> tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+    >>> model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")


-The original code can be found `here <https://github.com/allegro/HerBERT>`__.
+This model was contributed by `rmroczkowski <https://huggingface.co/rmroczkowski>`__. The original code can be found
+`here <https://github.com/allegro/HerBERT>`__.
+

 HerbertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/ibert.rst
+++ b/docs/source/model_doc/ibert.rst
@@ -36,8 +36,9 @@ the full-precision baseline. Furthermore, our preliminary implementation of I-BE
 INT8 inference on a T4 GPU system as compared to FP32 inference. The framework has been developed in PyTorch and has
 been open-sourced.*

+This model was contributed by `kssteven <https://huggingface.co/kssteven>`__. The original code can be found `here
+<https://github.com/kssteven418/I-BERT>`__.

-The original code can be found `here <https://github.com/kssteven418/I-BERT>`__.

 IBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -56,31 +56,32 @@ Tips:

 .. code-block::

-   def normalize_bbox(bbox, width, height):
-        return [
-            int(1000 * (bbox[0] / width)),
-            int(1000 * (bbox[1] / height)),
-            int(1000 * (bbox[2] / width)),
-            int(1000 * (bbox[3] / height)),
-        ]
+    def normalize_bbox(bbox, width, height):
+         return [
+             int(1000 * (bbox[0] / width)),
+             int(1000 * (bbox[1] / height)),
+             int(1000 * (bbox[2] / width)),
+             int(1000 * (bbox[3] / height)),
+         ]

 Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
 occurs. Those can be obtained using the Python Image Library (PIL) library for example, as follows:

 .. code-block::

-   from PIL import Image
+    from PIL import Image

-   image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.")

-   width, height = image.size
+    width, height = image.size

 - For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset
  <https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
  <https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb>`__.
  It includes an inference part, which shows how to use Google's Tesseract on a new document.

-The original code can be found `here <https://github.com/microsoft/unilm/tree/master/layoutlm>`_.
+This model was contributed by `liminghao1630 <https://huggingface.co/liminghao1630>`__. The original code can be found
+`here <https://github.com/microsoft/unilm/tree/master/layoutlm>`_.


 LayoutLMConfig
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -53,6 +53,8 @@ Tips:
 - A notebook showing how to fine-tune LED, can be accessed `here
  <https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing>`__.

+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__.
+

 LEDConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -73,8 +75,7 @@ LEDTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LEDTokenizerFast
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+    :members:


 LED specific outputs
--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -40,7 +40,8 @@ Tips:
  token belongs to which segment. Just separate your segments with the separation token :obj:`tokenizer.sep_token` (or
  :obj:`</s>`).

-The Authors' code can be found `here <https://github.com/allenai/longformer>`__.
+This model was contributed by `beltagy <https://huggingface.co/beltagy>`__. The Authors' code can be found `here
+<https://github.com/allenai/longformer>`__.

 Longformer Self Attention
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/luke.rst
+++ b/docs/source/model_doc/luke.rst
@@ -0,0 +1,159 @@
+..
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LUKE
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LUKE model was proposed in `LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention
+<https://arxiv.org/abs/2010.01057>`_ by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda and Yuji Matsumoto.
+It is based on RoBERTa and adds entity embeddings as well as an entity-aware self-attention mechanism, which helps
+improve performance on various downstream tasks involving reasoning about entities such as named entity recognition,
+extractive and cloze-style question answering, entity typing, and relation classification.
+
+The abstract from the paper is the following:
+
+*Entity representations are useful in natural language tasks involving entities. In this paper, we propose new
+pretrained contextualized representations of words and entities based on the bidirectional transformer. The proposed
+model treats words and entities in a given text as independent tokens, and outputs contextualized representations of
+them. Our model is trained using a new pretraining task based on the masked language model of BERT. The task involves
+predicting randomly masked words and entities in a large entity-annotated corpus retrieved from Wikipedia. We also
+propose an entity-aware self-attention mechanism that is an extension of the self-attention mechanism of the
+transformer, and considers the types of tokens (words or entities) when computing attention scores. The proposed model
+achieves impressive empirical performance on a wide range of entity-related tasks. In particular, it obtains
+state-of-the-art results on five well-known datasets: Open Entity (entity typing), TACRED (relation classification),
+CoNLL-2003 (named entity recognition), ReCoRD (cloze-style question answering), and SQuAD 1.1 (extractive question
+answering).*
+
+Tips:
+
+- This implementation is the same as :class:`~transformers.RobertaModel` with the addition of entity embeddings as well
+  as an entity-aware self-attention mechanism, which improves performance on tasks involving reasoning about entities.
+- LUKE treats entities as input tokens; therefore, it takes :obj:`entity_ids`, :obj:`entity_attention_mask`,
+  :obj:`entity_token_type_ids` and :obj:`entity_position_ids` as extra input. You can obtain those using
+  :class:`~transformers.LukeTokenizer`.
+- :class:`~transformers.LukeTokenizer` takes :obj:`entities` and :obj:`entity_spans` (character-based start and end
+  positions of the entities in the input text) as extra input. :obj:`entities` typically consist of [MASK] entities or
+  Wikipedia entities. The brief description when inputting these entities are as follows:
+
+  - *Inputting [MASK] entities to compute entity representations*: The [MASK] entity is used to mask entities to be
+    predicted during pretraining. When LUKE receives the [MASK] entity, it tries to predict the original entity by
+    gathering the information about the entity from the input text. Therefore, the [MASK] entity can be used to address
+    downstream tasks requiring the information of entities in text such as entity typing, relation classification, and
+    named entity recognition.
+  - *Inputting Wikipedia entities to compute knowledge-enhanced token representations*: LUKE learns rich information
+    (or knowledge) about Wikipedia entities during pretraining and stores the information in its entity embedding. By
+    using Wikipedia entities as input tokens, LUKE outputs token representations enriched by the information stored in
+    the embeddings of these entities. This is particularly effective for tasks requiring real-world knowledge, such as
+    question answering.
+
+- There are three head models for the former use case:
+
+  - :class:`~transformers.LukeForEntityClassification`, for tasks to classify a single entity in an input text such as
+    entity typing, e.g. the `Open Entity dataset <https://www.cs.utexas.edu/~eunsol/html_pages/open_entity.html>`__.
+    This model places a linear head on top of the output entity representation.
+  - :class:`~transformers.LukeForEntityPairClassification`, for tasks to classify the relationship between two entities
+    such as relation classification, e.g. the `TACRED dataset <https://nlp.stanford.edu/projects/tacred/>`__. This
+    model places a linear head on top of the concatenated output representation of the pair of given entities.
+  - :class:`~transformers.LukeForEntitySpanClassification`, for tasks to classify the sequence of entity spans, such as
+    named entity recognition (NER). This model places a linear head on top of the output entity representations. You
+    can address NER using this model by inputting all possible entity spans in the text to the model.
+
+  :class:`~transformers.LukeTokenizer` has a ``task`` argument, which enables you to easily create an input to these
+  head models by specifying ``task="entity_classification"``, ``task="entity_pair_classification"``, or
+  ``task="entity_span_classification"``. Please refer to the example code of each head models.
+
+  There are also 3 notebooks available, which showcase how you can reproduce the results as reported in the paper with
+  the HuggingFace implementation of LUKE. They can be found `here
+  <https://github.com/studio-ousia/luke/tree/master/notebooks>`__.
+
+Example:
+
+.. code-block::
+
+    >>> from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification
+
+    >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
+    >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
+
+    # Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé"
+    >>> text = "Beyoncé lives in Los Angeles."
+    >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
+    >>> inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+    >>> outputs = model(**inputs)
+    >>> word_last_hidden_state = outputs.last_hidden_state
+    >>> entity_last_hidden_state = outputs.entity_last_hidden_state
+
+    # Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations
+    >>> entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+    >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+    >>> inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+    >>> outputs = model(**inputs)
+    >>> word_last_hidden_state = outputs.last_hidden_state
+    >>> entity_last_hidden_state = outputs.entity_last_hidden_state
+
+    # Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model
+    >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
+    >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
+    >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+    >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
+    >>> predicted_class_idx = int(logits[0].argmax())
+    >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+
+This model was contributed by `ikuyamada <https://huggingface.co/ikuyamada>`__ and `nielsr
+<https://huggingface.co/nielsr>`__. The original code can be found `here <https://github.com/studio-ousia/luke>`__.
+
+
+LukeConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LukeConfig
+    :members:
+
+
+LukeTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LukeTokenizer
+    :members: __call__, save_vocabulary
+
+
+LukeModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LukeModel
+    :members: forward
+
+
+LukeForEntityClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LukeForEntityClassification
+    :members: forward
+
+
+LukeForEntityPairClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LukeForEntityPairClassification
+    :members: forward
+
+
+LukeForEntitySpanClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LukeForEntitySpanClassification
+    :members: forward
--- a/docs/source/model_doc/lxmert.rst
+++ b/docs/source/model_doc/lxmert.rst
@@ -52,7 +52,8 @@ Tips:
  contains self-attention for each respective modality and cross-attention, only the cross attention is returned and
  both self attention outputs are disregarded.

-The original code can be found `here <https://github.com/airsplay/lxmert>`__.
+This model was contributed by `eltoto1219 <https://huggingface.co/eltoto1219>`__. The original code can be found `here
+<https://github.com/airsplay/lxmert>`__.


 LxmertConfig
--- a/docs/source/model_doc/m2m_100.rst
+++ b/docs/source/model_doc/m2m_100.rst
@@ -34,6 +34,8 @@ to create high quality models. Our focus on non-English-Centric models brings ga
 translating between non-English directions while performing competitively to the best single systems of WMT. We
 open-source our scripts so that others may reproduce the data, evaluation, and final M2M-100 model.*

+This model was contributed by `valhalla <https://huggingface.co/valhalla>`__.
+

 Training and Generation
 _______________________________________________________________________________________________________________________
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -37,6 +37,7 @@ Implementation Notes
    - the model starts generating with :obj:`pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses
      :obj:`<s/>`),
 - Code to bulk convert models can be found in ``convert_marian_to_pytorch.py``.
+- This model was contributed by `sshleifer <https://huggingface.co/sshleifer>`__.

 Naming
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -29,7 +29,8 @@ corpora in many languages using the BART objective. mBART is one of the first me
 sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
 on the encoder, decoder, or reconstructing parts of the text.

-The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__
+This model was contributed by `valhalla <https://huggingface.co/valhalla>`__. The Authors' code can be found `here
+<https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__

 Training of MBart
 _______________________________________________________________________________________________________________________
--- a/docs/source/model_doc/megatron_bert.rst
+++ b/docs/source/model_doc/megatron_bert.rst
@@ -0,0 +1,154 @@
+.. 
+    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MegatronBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
+Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
+Jared Casper and Bryan Catanzaro.
+
+The abstract from the paper is the following:
+
+*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
+Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
+constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
+efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
+approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
+parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
+illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
+15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
+that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
+the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
+billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
+BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
+achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
+accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
+of 89.4%).*
+
+Tips:
+
+We have provided pretrained `BERT-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m>`__ checkpoints
+for use to evaluate or finetuning downstream tasks.
+
+To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
+Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
+<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
+
+Alternatively, you can directly download the checkpoints using:
+
+BERT-345M-uncased::
+
+.. code-block:: bash
+
+    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
+    -O megatron_bert_345m_v0_1_uncased.zip
+
+BERT-345M-cased::
+
+.. code-block:: bash
+
+    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
+    megatron_bert_345m_v0_1_cased.zip
+
+Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
+easily be loaded by Hugging Face Transformers and our port of the BERT code.
+
+The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains
+``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder::
+
+.. code-block:: bash
+
+    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip 
+
+.. code-block:: bash
+
+    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
+
+This model was contributed by `jdemouth <https://huggingface.co/jdemouth>`__. The original code can be found `here
+<https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU and multi-node implementation of the
+Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and
+"pipeline parallel" techniques.
+
+MegatronBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertConfig
+    :members:
+
+
+MegatronBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertModel
+    :members: forward
+
+
+MegatronBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForMaskedLM
+    :members: forward
+
+
+MegatronBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForCausalLM
+    :members: forward
+
+
+MegatronBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForNextSentencePrediction
+    :members: forward
+
+
+MegatronBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForPreTraining
+    :members: forward
+
+
+MegatronBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForSequenceClassification
+    :members: forward
+
+
+MegatronBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForMultipleChoice
+    :members: forward
+
+
+MegatronBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForTokenClassification
+    :members: forward
+
+
+MegatronBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MegatronBertForQuestionAnswering
+    :members: forward
+
+
--- a/docs/source/model_doc/megatron_gpt2.rst
+++ b/docs/source/model_doc/megatron_gpt2.rst
@@ -0,0 +1,71 @@
+.. 
+    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MegatronGPT2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
+Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
+Jared Casper and Bryan Catanzaro.
+
+The abstract from the paper is the following:
+
+*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
+Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
+constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
+efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
+approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
+parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
+illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
+15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
+that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
+the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
+billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
+BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
+achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
+accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
+of 89.4%).*
+
+Tips:
+
+We have provided pretrained `GPT2-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m>`__ checkpoints
+for use to evaluate or finetuning downstream tasks.
+
+To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
+Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
+<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
+
+Alternatively, you can directly download the checkpoints using::
+
+.. code-block:: bash
+
+    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
+    megatron_gpt2_345m_v0_0.zip
+
+Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily
+be loaded by Hugging Face Transformers GPT2 implementation.
+
+The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains
+``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder::
+
+.. code-block:: bash
+
+    python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
+
+This model was contributed by `jdemouth <https://huggingface.co/jdemouth>`__. The original code can be found `here
+<https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU and multi-node implementation of the
+Megatron Language models. In particular, it contains a hybrid model parallel approach using "tensor parallel" and
+"pipeline parallel" techniques.
+
--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@@ -44,7 +44,8 @@ Tips:
  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
  with a causal language modeling (CLM) objective are better in that regard.

-The original code can be found `here <https://github.com/google-research/mobilebert>`__.
+This model was contributed by `vshampor <https://huggingface.co/vshampor>`__. The original code can be found `here
+<https://github.com/google-research/mobilebert>`__.

 MobileBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@@ -28,7 +28,8 @@ multilingual variant of T5 that was pre-trained on a new Common Crawl-based data
 the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
 benchmarks. All of the code and model checkpoints*

-The original code can be found `here <https://github.com/google-research/multilingual-t5>`__.
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
+found `here <https://github.com/google-research/multilingual-t5>`__.

 MT5Config
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -31,7 +31,8 @@ According to the abstract,
  extractive summary.
 - Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.

-The Authors' code can be found `here <https://github.com/google-research/pegasus>`__.
+This model was contributed by `sshleifer <https://huggingface.co/sshleifer>`__. The Authors' code can be found `here
+<https://github.com/google-research/pegasus>`__.


 Checkpoints
@@ -52,7 +53,8 @@ Examples
 _______________________________________________________________________________________________________________________

 - :prefix_link:`Script <examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh>` to fine-tune pegasus
-  on the XSUM dataset. Data download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+  on the XSUM dataset. Data download instructions at :prefix_link:`examples/pytorch/summarization/
+  <examples/pytorch/summarization/README.md>`.
 - FP16 is not supported (help/ideas on this appreciated!).
 - The adafactor optimizer is recommended for pegasus fine-tuning.

--- a/docs/source/model_doc/phobert.rst
+++ b/docs/source/model_doc/phobert.rst
@@ -31,26 +31,26 @@ Example of use:

 .. code-block::

-  import torch
-  from transformers import AutoModel, AutoTokenizer
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer

-  phobert = AutoModel.from_pretrained("vinai/phobert-base")
-  tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+    >>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

-  # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
-  line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+    >>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+    >>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

-  input_ids = torch.tensor([tokenizer.encode(line)])
+    >>> input_ids = torch.tensor([tokenizer.encode(line)])

-  with torch.no_grad():
-      features = phobert(input_ids)  # Models outputs are now tuples
+    >>> with torch.no_grad():
+    ...     features = phobert(input_ids)  # Models outputs are now tuples

-  ## With TensorFlow 2.0+:
-  # from transformers import TFAutoModel
-  # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
+    >>> # With TensorFlow 2.0+:
+    >>> # from transformers import TFAutoModel
+    >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")


-The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
+    This model was contributed by `dqnguyen <https://huggingface.co/dqnguyen>`__. The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.

 PhobertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/rag.rst
+++ b/docs/source/model_doc/rag.rst
@@ -43,6 +43,7 @@ outperforming parametric seq2seq models and task-specific retrieve-and-extract a
 tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art
 parametric-only seq2seq baseline.*

+This model was contributed by `ola13 <https://huggingface.co/ola13>`__.


 RagConfig
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -32,7 +32,8 @@ layers instead of the standard residuals, which allows storing activations only
 N times, where N is the number of layers. The resulting model, the Reformer, performs on par with Transformer models
 while being much more memory-efficient and much faster on long sequences.*

-The Authors' code can be found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`__.
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
+found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`__.

 Axial Positional Encodings
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -145,8 +146,8 @@ For training, the :class:`~transformers.ReformerModelWithLMHead` should be used

 .. code-block::

-  input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
-  loss = model(input_ids, labels=input_ids)[0]
+    input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+    loss = model(input_ids, labels=input_ids)[0]


 ReformerConfig
--- a/docs/source/model_doc/retribert.rst
+++ b/docs/source/model_doc/retribert.rst
@@ -20,8 +20,8 @@ The RetriBERT model was proposed in the blog post `Explain Anything Like I'm Fiv
 Question Answering <https://yjernite.github.io/lfqa.html>`__. RetriBERT is a small model that uses either a single or
 pair of BERT encoders with lower-dimension projection for dense semantic indexing of text.

-Code to train and use the model can be found `here
-<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__.
+This model was contributed by `yjernite <https://huggingface.co/yjernite>`__. Code to train and use the model can be
+found :prefix_link:`here <examples/research-projects/distillation>`.


 RetriBertConfig
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -44,7 +44,8 @@ Tips:
  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`</s>`)
 - :doc:`CamemBERT <camembert>` is a wrapper around RoBERTa. Refer to this page for usage examples.

-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_.
+This model was contributed by `julien-c <https://huggingface.co/julien-c>`__. The original code can be found `here
+<https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_.


 RobertaConfig
@@ -165,3 +166,38 @@ FlaxRobertaModel

 .. autoclass:: transformers.FlaxRobertaModel
    :members: __call__
+
+
+FlaxRobertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaForMaskedLM
+    :members: __call__
+
+
+FlaxRobertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaForSequenceClassification
+    :members: __call__
+
+
+FlaxRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaForMultipleChoice
+    :members: __call__
+
+
+FlaxRobertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaForTokenClassification
+    :members: __call__
+
+
+FlaxRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/speech_to_text.rst
+++ b/docs/source/model_doc/speech_to_text.rst
@@ -25,7 +25,8 @@ transcripts/translations autoregressively. Speech2Text has been fine-tuned on se
 `LibriSpeech <http://www.openslr.org/12>`__, `CoVoST 2 <https://github.com/facebookresearch/covost>`__, `MuST-C
 <https://ict.fbk.eu/must-c/>`__.

-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text>`__.
+This model was contributed by `valhalla <https://huggingface.co/valhalla>`__. The original code can be found `here
+<https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text>`__.


 Inference
--- a/docs/source/model_doc/squeezebert.rst
+++ b/docs/source/model_doc/squeezebert.rst
@@ -47,6 +47,9 @@ Tips:
 - For best results when finetuning on sequence classification tasks, it is recommended to start with the
  `squeezebert/squeezebert-mnli-headless` checkpoint.

+This model was contributed by `forresti <https://huggingface.co/forresti>`__.
+
+
 SqueezeBertConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -48,7 +48,8 @@ Tips:
  layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative scalar embeddings.
  Encoder input padding can be done on the left and on the right.

-The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://github.com/google-research/text-to-text-transfer-transformer>`__.

 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -73,10 +74,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash

 .. code-block::

-  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-  # the forward function automatically creates the correct decoder_input_ids
-  loss = model(input_ids=input_ids, labels=labels).loss
+    input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+    labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+    # the forward function automatically creates the correct decoder_input_ids
+    loss = model(input_ids=input_ids, labels=labels).loss

 - Supervised training

@@ -86,10 +87,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash

 .. code-block::

-  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-  # the forward function automatically creates the correct decoder_input_ids
-  loss = model(input_ids=input_ids, labels=labels).loss
+    input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+    labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+    # the forward function automatically creates the correct decoder_input_ids
+    loss = model(input_ids=input_ids, labels=labels).loss


 T5Config
--- a/docs/source/model_doc/tapas.rst
+++ b/docs/source/model_doc/tapas.rst
@@ -49,7 +49,8 @@ entailment (a binary classification task). For more details, see their follow-up
 intermediate pre-training <https://www.aclweb.org/anthology/2020.findings-emnlp.27/>`__ by Julian Martin Eisenschlos,
 Syrine Krichene and Thomas Müller.

-The original code can be found `here <https://github.com/google-research/tapas>`__.
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/google-research/tapas>`__.

 Tips:

--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -41,7 +41,8 @@ Tips:
  original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
 - Transformer-XL is one of the few models that has no sequence length limit.

-The original code can be found `here <https://github.com/kimiyoung/transformer-xl>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://github.com/kimiyoung/transformer-xl>`__.


 TransfoXLConfig
--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -1,5 +1,5 @@
 .. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
+    Copyright 2021 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at
@@ -47,10 +47,6 @@ Tips:
  which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
  used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
  vectors to a standard Transformer encoder.
- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
-  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
-  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. The authors report the best results with a resolution of 384x384
-  during fine-tuning.
 - As the Vision Transformer expects each image to be of the same size (resolution), one can use
  :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model.
 - Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
@@ -61,13 +57,18 @@ Tips:
  14 million images and 21k classes) only, or (2) also fine-tuned on `ImageNet
  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
  images and 1,000 classes).
+- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
+  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
+  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. In order to fine-tune at higher resolution, the authors perform
+  2D interpolation of the pre-trained position embeddings, according to their location in the original image.
 - The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
  an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked
  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.


-The original code (written in JAX) can be found `here <https://github.com/google-research/vision_transformer>`__.
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code (written in JAX) can be
+found `here <https://github.com/google-research/vision_transformer>`__.

 Note that we converted the weights from Ross Wightman's `timm library
 <https://github.com/rwightman/pytorch-image-models>`__, who already converted the weights from JAX to PyTorch. Credits
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -36,6 +36,8 @@ Tips:
 - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
  using :class:`~transformers.Wav2Vec2CTCTokenizer`.

+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__.
+

 Wav2Vec2Config
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -42,7 +42,8 @@ Tips:
 - XLM has multilingual checkpoints which leverage a specific :obj:`lang` parameter. Check out the :doc:`multi-lingual
  <../multilingual>` page for more information.

-The original code can be found `here <https://github.com/facebookresearch/XLM/>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://github.com/facebookresearch/XLM/>`__.


 XLMConfig
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -44,7 +44,8 @@ Tips:
 - This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage examples
  as well as the information relative to the inputs and outputs.

-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`__.
+This model was contributed by `stefan-it <https://huggingface.co/stefan-it>`__. The original code can be found `here
+<https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`__.


 XLMRobertaConfig
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -41,10 +41,11 @@ Tips:
  using only a sub-set of the output tokens as target which are selected with the :obj:`target_mapping` input.
 - To use XLNet for sequential decoding (i.e. not in fully bi-directional setting), use the :obj:`perm_mask` and
  :obj:`target_mapping` inputs to control the attention span and outputs (see examples in
-  `examples/text-generation/run_generation.py`)
+  `examples/pytorch/text-generation/run_generation.py`)
 - XLNet is one of the few models that has no sequence length limit.

-The original code can be found `here <https://github.com/zihangdai/xlnet/>`__.
+This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
+<https://github.com/zihangdai/xlnet/>`__.


 XLNetConfig
--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -22,8 +22,6 @@ the `model hub <https://huggingface.co/models>`__.

    Optionally, you can join an existing organization or create a new one.

-Prepare your model for uploading
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 We have seen in the :doc:`training tutorial <training>`: how to fine-tune a model on a given task. You have probably
 done something similar on your task, either using the model directly in your own training loop or using the
@@ -31,7 +29,7 @@ done something similar on your task, either using the model directly in your own
 `model hub <https://huggingface.co/models>`__.

 Model versioning
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Since version v3.5.0, the model hub has built-in model versioning based on git and git-lfs. It is based on the paradigm
 that one model *is* one repo.
@@ -54,6 +52,106 @@ For instance:
    >>>   revision="v2.0.1" # tag name, or branch name, or commit hash
    >>> )

+
+Push your model from Python
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Preparation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The first step is to make sure your credentials to the hub are stored somewhere. This can be done in two ways. If you
+have access to a terminal, you cam just run the following command in the virtual environment where you installed 🤗
+Transformers:
+
+.. code-block:: bash
+
+    transformers-cli login
+
+It will store your access token in the Hugging Face cache folder (by default :obj:`~/.cache/`).
+
+If you don't have an easy access to a terminal (for instance in a Colab session), you can find a token linked to your
+acount by going on `huggingface.co <https://huggingface.co/>`, click on your avatar on the top left corner, then on
+`Edit profile` on the left, just beneath your profile picture. In the submenu `API Tokens`, you will find your API
+token that you can just copy.
+
+Directly push your model to the hub
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once you have an API token (either stored in the cache or copied and pasted in your notebook), you can directly push a
+finetuned model you saved in :obj:`save_drectory` by calling:
+
+.. code-block:: python
+
+    finetuned_model.push_to_hub("my-awesome-model")
+
+If you have your API token not stored in the cache, you will need to pass it with :obj:`use_auth_token=your_token`.
+This is also be the case for all the examples below, so we won't mention it again.
+
+This will create a repository in your namespace name :obj:`my-awesome-model`, so anyone can now run:
+
+.. code-block:: python
+
+    from transformers import AutoModel
+
+    model = AutoModel.from_pretrained("your_username/my-awesome-model")
+
+Even better, you can combine this push to the hub with the call to :obj:`save_pretrained`:
+
+.. code-block:: python
+
+    finetuned_model.save_pretrained(save_directory, push_to_hub=True, repo_name="my-awesome-model")
+
+If you are a premium user and want your model to be private, just add :obj:`private=True` to this call.
+
+If you are a member of an organization and want to push it inside the namespace of the organization instead of yours,
+just add :obj:`organization=my_amazing_org`.
+
+Add new files to your model repo
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once you have pushed your model to the hub, you might want to add the tokenizer, or a version of your model for another
+framework (TensorFlow, PyTorch, Flax). This is super easy to do! Let's begin with the tokenizer. You can add it to the
+repo you created before like this
+
+.. code-block:: python
+
+    tokenizer.push_to_hub("my-awesome-model")
+
+If you know its URL (it should be :obj:`https://huggingface.co/username/repo_name`), you can also do:
+
+.. code-block:: python
+
+    tokenizer.push_to_hub(repo_url=my_repo_url)
+
+And that's all there is to it! It's also a very easy way to fix a mistake if one of the files online had a bug.
+
+To add a model for another backend, it's also super easy. Let's say you have fine-tuned a TensorFlow model and want to
+add the pytorch model files to your model repo, so that anyone in the community can use it. The following allows you to
+directly create a PyTorch version of your TensorFlow model:
+
+.. code-block:: python
+
+    from transfomers import AutoModel
+
+    model = AutoModel.from_pretrained(save_directory, from_tf=True)
+
+You can also replace :obj:`save_directory` by the identifier of your model (:obj:`username/repo_name`) if you don't
+have a local save of it anymore. Then, just do the same as before:
+
+.. code-block:: python
+
+    model.push_to_hub("my-awesome-model")
+
+or
+
+.. code-block:: python
+
+    model.push_to_hub(repo_url=my_repo_url)
+
+
+Use your terminal and git
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 Basic steps
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@@ -682,7 +682,8 @@ The `mbart-large-en-ro checkpoint <https://huggingface.co/facebook/mbart-large-e
 romanian translation.

 The `mbart-large-cc25 <https://huggingface.co/facebook/mbart-large-cc25>`_ checkpoint can be finetuned for other
-translation and summarization tasks, using code in ```examples/seq2seq/``` , but is not very useful without finetuning.
+translation and summarization tasks, using code in ```examples/pytorch/translation/``` , but is not very useful without
+finetuning.


 ProphetNet
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -90,8 +90,8 @@ You can then feed it all as input to your model:
    >>> outputs = model(input_ids, langs=langs)


-The example :prefix_link:`run_generation.py <examples/text-generation/run_generation.py>` can generate text using the
-CLM checkpoints from XLM, using the language embeddings.
+The example :prefix_link:`run_generation.py <examples/pytorch/text-generation/run_generation.py>` can generate text
+using the CLM checkpoints from XLM, using the language embeddings.

 XLM without Language Embeddings
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -238,23 +238,22 @@ keys directly to tensors, for a PyTorch model, you need to unpack the dictionary
    >>> ## TENSORFLOW CODE
    >>> tf_outputs = tf_model(tf_batch)

-In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the final
-activations of the model.
+In 🤗 Transformers, all outputs are objects that contain the model's final activations along with other metadata. These
+objects are described in greater detail :doc:`here <main_classes/output>`. For now, let's inspect the output ourselves:

 .. code-block::

    >>> ## PYTORCH CODE
    >>> print(pt_outputs)
-    (tensor([[-4.0833,  4.3364],
-            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
+    SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833,  4.3364],
+        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> print(tf_outputs)
-    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.336414  ],
-           [ 0.08181786, -0.04179301]], dtype=float32)>,)
+    TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0832963 ,  4.3364143 ],
+           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)

-The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
-the final activations, so we get a tuple with one element.
+Notice how the output object has a ``logits`` attribute. You can use this to access the model's final activations.

 .. note::

@@ -267,10 +266,10 @@ Let's apply the SoftMax activation to get predictions.

    >>> ## PYTORCH CODE
    >>> import torch.nn.functional as F
-    >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
+    >>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
-    >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
+    >>> tf.nn.softmax(tf_outputs.logits, axis=-1)

 We can see we get the numbers from before:

@@ -286,16 +285,24 @@ We can see we get the numbers from before:
    tensor([[2.2043e-04, 9.9978e-01],
            [5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)

-If you have labels, you can provide them to the model, it will return a tuple with the loss and the final activations.
+If you provide the model with labels in addition to inputs, the model output object will also contain a ``loss``
+attribute:

 .. code-block::

    >>> ## PYTORCH CODE
    >>> import torch
    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+    >>> print(pt_outputs)
+    SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833,  4.3364],
+    [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+    >>> print(tf_outputs)
+    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051287e-04, 6.3326043e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0832963 ,  4.3364143 ],
+           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)

 Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or `tf.keras.Model
 <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual training loop. 🤗
@@ -323,6 +330,7 @@ loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TF

 .. code-block::

+    from transformers import TFAutoModel
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)

@@ -330,6 +338,7 @@ and if you are loading a saved TensorFlow model in a PyTorch model, you should u

 .. code-block::

+    from transformers import AutoModel
    tokenizer = AutoTokenizer.from_pretrained(save_directory)
    model = AutoModel.from_pretrained(save_directory, from_tf=True)

@@ -340,10 +349,12 @@ Lastly, you can also ask the model to return all hidden states and all attention

    >>> ## PYTORCH CODE
    >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
-    >>> all_hidden_states, all_attentions = pt_outputs[-2:]
+    >>> all_hidden_states  = pt_outputs.hidden_states 
+    >>> all_attentions = pt_outputs.attentions
    >>> ## TENSORFLOW CODE
    >>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
-    >>> all_hidden_states, all_attentions = tf_outputs[-2:]
+    >>> all_hidden_states =  tf_outputs.hidden_states
+    >>> all_attentions = tf_outputs.attentions

 Accessing the code
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -376,16 +387,16 @@ directly instantiate model and tokenizer without the auto magic:
 Customizing the model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-If you want to change how the model itself is built, you can define your custom configuration class. Each architecture
-comes with its own relevant configuration (in the case of DistilBERT, :class:`~transformers.DistilBertConfig`) which
-allows you to specify any of the hidden dimension, dropout rate, etc. If you do core modifications, like changing the
-hidden size, you won't be able to use a pretrained model anymore and will need to train from scratch. You would then
-instantiate the model directly from this configuration.
+If you want to change how the model itself is built, you can define a custom configuration class. Each architecture
+comes with its own relevant configuration. For example, :class:`~transformers.DistilBertConfig` allows you to specify
+parameters such as the hidden dimension, dropout rate, etc for DistilBERT. If you do core modifications, like changing
+the hidden size, you won't be able to use a pretrained model anymore and will need to train from scratch. You would
+then instantiate the model directly from this configuration.

-Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer with the
-:func:`~transformers.DistilBertTokenizer.from_pretrained` method) and initialize the model from scratch (hence
-instantiate the model from the configuration instead of using the
-:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).
+Below, we load a predefined vocabulary for a tokenizer with the
+:func:`~transformers.DistilBertTokenizer.from_pretrained` method. However, unlike the tokenizer, we wish to initialize
+the model from scratch. Therefore, we instantiate the model from a configuration instead of using the
+:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method.

 .. code-block::

@@ -402,9 +413,9 @@ instantiate the model from the configuration instead of using the

 For something that only changes the head of the model (for instance, the number of labels), you can still use a
 pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
-We could create a configuration with all the default values and just change the number of labels, but more easily, you
-can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
-default configuration with it:
+Instead of creating a new configuration with all the default values just to change the number of labels, we can instead
+pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the default
+configuration appropriately:

 .. code-block::

--- a/docs/source/sagemaker.md
+++ b/docs/source/sagemaker.md
@@ -325,7 +325,7 @@ When you create a `HuggingFace` Estimator, you can specify a [training script th

 If you are using `git_config` to run the [🤗 Transformers examples scripts](https://github.com/huggingface/transformers/tree/master/examples) keep in mind that you need to configure the right `'branch'` for you `transformers_version`, e.g. if you use `transformers_version='4.4.2` you have to use `'branch':'v4.4.2'`. 

-As an example to use `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/text-classification).
+As an example to use `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification).

 _Tip: define `output_dir` as `/opt/ml/model` in the hyperparameter for the script to save your model to S3 after training._

@@ -338,7 +338,7 @@ git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch'
 # create the Estimator
 huggingface_estimator = HuggingFace(
        entry_point='run_glue.py',
-        source_dir='./examples/text-classification',
+        source_dir='./examples/pytorch/text-classification',
        git_config=git_config,
        instance_type='ml.p3.2xlarge',
        instance_count=1,
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -55,10 +55,10 @@ Sequence Classification
 Sequence classification is the task of classifying sequences according to a given number of classes. An example of
 sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune a
 model on a GLUE sequence classification task, you may leverage the :prefix_link:`run_glue.py
-<examples/text-classification/run_glue.py>`, :prefix_link:`run_tf_glue.py
-<examples/text-classification/run_tf_glue.py>`, :prefix_link:`run_tf_text_classification.py
-<examples/text-classification/run_tf_text_classification.py>` or :prefix_link:`run_xnli.py
-<examples/text-classification/run_xnli.py>` scripts.
+<examples/pytorch/text-classification/run_glue.py>`, :prefix_link:`run_tf_glue.py
+<examples/tensorflow/text-classification/run_tf_glue.py>`, :prefix_link:`run_tf_text_classification.py
+<examples/tensorflow/text-classification/run_tf_text_classification.py>` or :prefix_link:`run_xnli.py
+<examples/pytorch/text-classification/run_xnli.py>` scripts.

 Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative. It
 leverages a fine-tuned model on sst2, which is a GLUE task.
@@ -85,9 +85,8 @@ each other. The process is the following:

 1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
   with the weights stored in the checkpoint.
-2. Build a sequence from the two sentences, with the correct model-specific separators token type ids and attention
-   masks (:func:`~transformers.PreTrainedTokenizer.encode` and :func:`~transformers.PreTrainedTokenizer.__call__` take
-   care of this).
+2. Build a sequence from the two sentences, with the correct model-specific separators, token type ids and attention
+   masks (which will be created automatically by the tokenizer).
 3. Pass this sequence through the model so that it is classified in one of the two available classes: 0 (not a
   paraphrase) and 1 (is a paraphrase).
 4. Compute the softmax of the result to get probabilities over the classes.
@@ -108,6 +107,7 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

+    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

@@ -141,6 +141,7 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

+    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")

@@ -168,8 +169,10 @@ Extractive Question Answering
 Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
 question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune a
 model on a SQuAD task, you may leverage the `run_qa.py
-<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_qa.py>`__ and `run_tf_squad.py
-<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_tf_squad.py>`__ scripts.
+<https://github.com/huggingface/transformers/tree/master/examples/pytorch/question-answering/run_qa.py>`__ and
+`run_tf_squad.py
+<https://github.com/huggingface/transformers/tree/master/examples/tensorflow/question-answering/run_tf_squad.py>`__
+scripts.


 Here is an example of using pipelines to do question answering: extracting an answer from a text given a question. It
@@ -184,7 +187,7 @@ leverages a fine-tuned model on SQuAD.
    >>> context = r"""
    ... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
    ... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
-    ... a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
+    ... a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
    ... """

 This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values, which are the
@@ -325,8 +328,7 @@ fill that mask with an appropriate token. This allows the model to attend to bot
 right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis for
 downstream tasks requiring bi-directional context, such as SQuAD (question answering, see `Lewis, Lui, Goyal et al.
 <https://arxiv.org/abs/1910.13461>`__, part 4.2). If you would like to fine-tune a model on a masked language modeling
-task, you may leverage the `run_mlm.py
-<https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_mlm.py>`__ script.
+task, you may leverage the :prefix_link:`run_mlm.py <examples/pytorch/language-modeling/run_mlm.py>` script.

 Here is an example of using pipelines to replace a mask from a sequence:

@@ -435,7 +437,7 @@ Causal Language Modeling
 Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
 model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
 for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the
-`run_clm.py <https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_clm.py>`__ script.
+:prefix_link:`run_clm.py <examples/pytorch/language-modeling/run_clm.py>` script.

 Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the
 input sequence.
@@ -503,8 +505,8 @@ This outputs a (hopefully) coherent next token following the original sequence,
    >>> print(resulting_string)
    Hugging Face is based in DUMBO, New York City, and has

-In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to
-generate multiple tokens up to a user-defined length.
+In the next section, we show how :func:`~transformers.PreTrainedModel.generate` can be used to generate multiple tokens
+up to a specified length instead of one token at a time.

 Text Generation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -525,10 +527,11 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config


 Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am
-concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overridden in the
-pipeline, as is shown above for the argument ``max_length``.
+concerned, I will"*. Behind the scenes, the pipeline object calls the method
+:func:`~transformers.PreTrainedModel.generate` to generate text. The default arguments for this method can be
+overridden in the pipeline, as is shown above for the arguments ``max_length`` and ``do_sample``.

-Here is an example of text generation using ``XLNet`` and its tokenizer.
+Below is an example of text generation using ``XLNet`` and its tokenizer, which includes calling ``generate`` directly:

 .. code-block::

@@ -602,8 +605,7 @@ Named Entity Recognition
 Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example, identifying a token
 as a person, an organisation or a location. An example of a named entity recognition dataset is the CoNLL-2003 dataset,
 which is entirely based on that task. If you would like to fine-tune a model on an NER task, you may leverage the
-`run_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_ner.py>`__
-script.
+:prefix_link:`run_ner.py <examples/pytorch/token-classification/run_ner.py>` script.

 Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as
 belonging to one of 9 classes:
@@ -627,8 +629,8 @@ It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https:

    >>> nlp = pipeline("ner")

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very"
-    ...            "close to the Manhattan Bridge which is visible from the window."
+    >>> sequence = """Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, 
+    ... therefore very close to the Manhattan Bridge which is visible from the window."""


 This outputs a list of all words that have been identified as one of the entities from the 9 classes defined above.
@@ -659,15 +661,14 @@ Here is an example of doing named entity recognition, using a model and a tokeni

 1. Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
   with the weights stored in the checkpoint.
-2. Define the label list with which the model was trained on.
-3. Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
-4. Split words into tokens so that they can be mapped to predictions. We use a small hack by, first, completely
+2. Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+3. Split words into tokens so that they can be mapped to predictions. We use a small hack by, first, completely
   encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
-5. Encode that sequence into IDs (special tokens are added automatically).
-6. Retrieve the predictions by passing the input to the model and getting the first output. This results in a
+4. Encode that sequence into IDs (special tokens are added automatically).
+5. Retrieve the predictions by passing the input to the model and getting the first output. This results in a
   distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class for
   each token.
-7. Zip together each token with its prediction and print it.
+6. Zip together each token with its prediction and print it.

 .. code-block::

@@ -706,18 +707,6 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> label_list = [
-    ...     "O",       # Outside of a named entity
-    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-    ...     "I-MISC",  # Miscellaneous entity
-    ...     "B-PER",   # Beginning of a person's name right after another person's name
-    ...     "I-PER",   # Person's name
-    ...     "B-ORG",   # Beginning of an organisation right after another organisation
-    ...     "I-ORG",   # Organisation
-    ...     "B-LOC",   # Beginning of a location right after another location
-    ...     "I-LOC"    # Location
-    ... ]
-
    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
    ...            "close to the Manhattan Bridge."

@@ -731,23 +720,61 @@ Here is an example of doing named entity recognition, using a model and a tokeni

 This outputs a list of each token mapped to its corresponding prediction. Differently from the pipeline, here every
 token has a prediction as we didn't remove the "0"th class, which means that no particular entity was found on that
-token. The following array should be the output:
+token.
+
+In the above example, ``predictions`` is an integer that corresponds to the predicted class. We can use the
+``model.config.id2label`` property in order to recover the class name corresponding to the class number, which is
+illustrated below:

 .. code-block::

-    >>> print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
-    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
+    >>> for token, prediction in zip(tokens, predictions[0].numpy()):
+    ...     print((token, model.config.id2label[prediction]))
+    ('[CLS]', 'O')
+    ('Hu', 'I-ORG')
+    ('##gging', 'I-ORG')
+    ('Face', 'I-ORG')
+    ('Inc', 'I-ORG')
+    ('.', 'O')
+    ('is', 'O')
+    ('a', 'O')
+    ('company', 'O')
+    ('based', 'O')
+    ('in', 'O')
+    ('New', 'I-LOC')
+    ('York', 'I-LOC')
+    ('City', 'I-LOC')
+    ('.', 'O')
+    ('Its', 'O')
+    ('headquarters', 'O')
+    ('are', 'O')
+    ('in', 'O')
+    ('D', 'I-LOC')
+    ('##UM', 'I-LOC')
+    ('##BO', 'I-LOC')
+    (',', 'O')
+    ('therefore', 'O')
+    ('very', 'O')
+    ('##c', 'O')
+    ('##lose', 'O')
+    ('to', 'O')
+    ('the', 'O')
+    ('Manhattan', 'I-LOC')
+    ('Bridge', 'I-LOC')
+    ('.', 'O')
+    ('[SEP]', 'O')

 Summarization
 -----------------------------------------------------------------------------------------------------------------------

 Summarization is the task of summarizing a document or an article into a shorter text. If you would like to fine-tune a
 model on a summarization task, you may leverage the `run_summarization.py
-<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_summarization.py>`__ script.
+<https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization/run_summarization.py>`__
+script.

 An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was
 created for the task of summarization. If you would like to fine-tune a model on a summarization task, various
-approaches are described in this :prefix_link:`document <examples/seq2seq/README.md>`.
+approaches are described in this :prefix_link:`document <examples/pytorch/summarization/README.md>`.

 Here is an example of using the pipelines to do summarization. It leverages a Bart model that was fine-tuned on the CNN
 / Daily Mail data set.
@@ -794,7 +821,7 @@ Here is an example of doing summarization using a model and a tokenizer. The pro
 3. Add the T5 specific prefix "summarize: ".
 4. Use the ``PreTrainedModel.generate()`` method to generate the summary.

-In this example we use Google`s T5 model. Even though it was pre-trained only on a multi-task mixed dataset (including
+In this example we use Google's T5 model. Even though it was pre-trained only on a multi-task mixed dataset (including
 CNN / Daily Mail), it yields very good results.

 .. code-block::
@@ -818,16 +845,23 @@ CNN / Daily Mail), it yields very good results.
    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

+.. code-block::
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them between 1999 and 2002.</s>
+
+
 Translation
 -----------------------------------------------------------------------------------------------------------------------

 Translation is the task of translating a text from one language to another. If you would like to fine-tune a model on a
 translation task, you may leverage the `run_translation.py
-<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_translation.py>`__ script.
+<https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation/run_translation.py>`__ script.

 An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input
 data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a
-translation task, various approaches are described in this :prefix_link:`document <examples/seq2seq/README.md>`.
+translation task, various approaches are described in this :prefix_link:`document
+<examples/pytorch.translation/README.md>`.

 Here is an example of using the pipelines to do translation. It leverages a T5 model that was only pre-trained on a
 multi-task mixture dataset (including WMT), yet, yielding impressive translation results.
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1,4 +1,4 @@
-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -70,19 +70,19 @@ Run all:

 .. code-block:: console

-   pytest
+    pytest

 or:

 .. code-block:: bash

-   make test
+    make test

 Note that the latter is defined as:

 .. code-block:: bash

-   python -m pytest -n auto --dist=loadfile -s -v ./tests/
+    python -m pytest -n auto --dist=loadfile -s -v ./tests/

 which tells pytest to:

@@ -100,13 +100,13 @@ All tests of the test suite:

 .. code-block:: bash

-   pytest --collect-only -q
+    pytest --collect-only -q

 All tests of a given test file:

 .. code-block:: bash

-   pytest tests/test_optimization.py --collect-only -q
+    pytest tests/test_optimization.py --collect-only -q



@@ -117,7 +117,7 @@ To run an individual test module:

 .. code-block:: bash

-   pytest tests/test_logging.py
+    pytest tests/test_logging.py


 Run specific tests
@@ -128,7 +128,7 @@ class containing those tests. For example, it could be:

 .. code-block:: bash

-   pytest tests/test_optimization.py::OptimizationTest::test_adam_w
+    pytest tests/test_optimization.py::OptimizationTest::test_adam_w

 Here:

@@ -140,7 +140,7 @@ If the file contains multiple classes, you can choose to run only tests of a giv

 .. code-block:: bash

-   pytest tests/test_optimization.py::OptimizationTest
+    pytest tests/test_optimization.py::OptimizationTest


 will run all the tests inside that class.
@@ -149,7 +149,7 @@ As mentioned earlier you can see what tests are contained inside the ``Optimizat

 .. code-block:: bash

-   pytest tests/test_optimization.py::OptimizationTest --collect-only -q
+    pytest tests/test_optimization.py::OptimizationTest --collect-only -q

 You can run tests by keyword expressions.

@@ -157,7 +157,7 @@ To run only tests whose name contains ``adam``:

 .. code-block:: bash

-   pytest -k adam tests/test_optimization.py
+    pytest -k adam tests/test_optimization.py

 Logical ``and`` and ``or`` can be used to indicate whether all keywords should match or either. ``not`` can be used to
 negate.
@@ -166,19 +166,19 @@ To run all tests except those whose name contains ``adam``:

 .. code-block:: bash

-   pytest -k "not adam" tests/test_optimization.py
+    pytest -k "not adam" tests/test_optimization.py

 And you can combine the two patterns in one:

 .. code-block:: bash

-   pytest -k "ada and not adam" tests/test_optimization.py
+    pytest -k "ada and not adam" tests/test_optimization.py

 For example to run both ``test_adafactor`` and ``test_adam_w`` you can use:

 .. code-block:: bash

-   pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py
+    pytest -k "test_adam_w or test_adam_w" tests/test_optimization.py

 Note that we use ``or`` here, since we want either of the keywords to match to include both.

@@ -186,7 +186,7 @@ If you want to include only tests that include both patterns, ``and`` is to be u

 .. code-block:: bash

-   pytest -k "test and ada" tests/test_optimization.py
+    pytest -k "test and ada" tests/test_optimization.py



@@ -251,7 +251,7 @@ example, to run all except ``test_modeling_*.py`` tests:

 .. code-block:: bash

-   pytest `ls -1 tests/*py | grep -v test_modeling`
+    pytest `ls -1 tests/*py | grep -v test_modeling`


 Clearing state
@@ -292,13 +292,13 @@ Repeat tests

 .. code-block:: bash

-   pip install pytest-flakefinder
+    pip install pytest-flakefinder

 And then run every test multiple times (50 by default):

 .. code-block:: bash

-   pytest --flake-finder --flake-runs=5 tests/test_failing_test.py
+    pytest --flake-finder --flake-runs=5 tests/test_failing_test.py

 .. note::
   This plugin doesn't work with ``-n`` flag from ``pytest-xdist``.
@@ -322,19 +322,19 @@ As explained earlier this allows detection of coupled tests - where one test's s

 .. code-block:: bash

-   pytest tests
-   [...]
-   Using --random-order-bucket=module
-   Using --random-order-seed=573663
+    pytest tests
+    [...]
+    Using --random-order-bucket=module
+    Using --random-order-seed=573663

 So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.:

 .. code-block:: bash

-   pytest --random-order-seed=573663
-   [...]
-   Using --random-order-bucket=module
-   Using --random-order-seed=573663
+    pytest --random-order-seed=573663
+    [...]
+    Using --random-order-bucket=module
+    Using --random-order-seed=573663

 It will only reproduce the exact order if you use the exact same list of tests (or no list at all). Once you start to
 manually narrowing down the list you can no longer rely on the seed, but have to list them manually in the exact order
@@ -342,7 +342,7 @@ they failed and tell pytest to not randomize them instead using ``--random-order

 .. code-block:: bash

-   pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py
+    pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py

 To disable the shuffling for all tests:

@@ -369,7 +369,7 @@ progressbar, and show tests that fail and the assert instantly. It gets activate

 .. code-block:: bash

-   pip install pytest-sugar
+    pip install pytest-sugar

 To run tests without it, run:

@@ -388,7 +388,7 @@ For a single or a group of tests via ``pytest`` (after ``pip install pytest-pspe

 .. code-block:: bash

-   pytest --pspec tests/test_optimization.py 
+    pytest --pspec tests/test_optimization.py



@@ -490,8 +490,8 @@ Inside tests:

 .. code-block:: bash

-   from transformers.testing_utils import get_gpu_count
-   n_gpu = get_gpu_count() # works with torch and tf
+    from transformers.testing_utils import get_gpu_count
+    n_gpu = get_gpu_count() # works with torch and tf



@@ -502,20 +502,18 @@ Distributed training
 thing and end up thinking they are ``pytest`` and start running the test suite in loops. It works, however, if one
 spawns a normal process that then spawns off multiple workers and manages the IO pipes.

-This is still under development but you can study 2 different tests that perform this successfully:
+Here are some tests that use it:

-* :prefix_link:`test_seq2seq_examples_multi_gpu.py <examples/seq2seq/test_seq2seq_examples_multi_gpu.py>` - a
-  ``pytorch-lightning``-running test (had to use PL's ``ddp`` spawning method which is the default)
-* :prefix_link:`test_finetune_trainer.py <examples/seq2seq/test_finetune_trainer.py>` - a normal (non-PL) test
+* :prefix_link:`test_trainer_distributed.py <tests/test_trainer_distributed.py>`
+* :prefix_link:`test_deepspeed.py <tests/deepspeed/test_deepspeed.py>`

-To jump right into the execution point, search for the ``execute_subprocess_async`` function in those tests.
+To jump right into the execution point, search for the ``execute_subprocess_async`` call in those tests.

 You will need at least 2 GPUs to see these tests in action:

 .. code-block:: bash

-   CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \
-   examples/seq2seq/test_seq2seq_examples_multi_gpu.py
+    CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py


 Output capture
@@ -528,13 +526,13 @@ To disable output capturing and to get the ``stdout`` and ``stderr`` normally, u

 .. code-block:: bash

-   pytest -s tests/test_logging.py
+    pytest -s tests/test_logging.py

 To send test results to JUnit format output:

 .. code-block:: bash

-   py.test tests --junitxml=result.xml
+    py.test tests --junitxml=result.xml


 Color control
@@ -544,7 +542,7 @@ To have no color (e.g., yellow on white background is not readable):

 .. code-block:: bash

-   pytest --color=no tests/test_logging.py
+    pytest --color=no tests/test_logging.py



@@ -555,7 +553,7 @@ Creating a URL for each test failure:

 .. code-block:: bash

-   pytest --pastebin=failed tests/test_logging.py
+    pytest --pastebin=failed tests/test_logging.py

 This will submit test run information to a remote Paste service and provide a URL for each failure. You may select
 tests as usual or add for example -x if you only want to send one particular failure.
@@ -564,7 +562,7 @@ Creating a URL for a whole test session log:

 .. code-block:: bash

-   pytest --pastebin=all tests/test_logging.py
+    pytest --pastebin=all tests/test_logging.py



@@ -606,13 +604,13 @@ and you could run just the ``negative`` and ``integer`` sets of params with:

 .. code-block:: bash

-   pytest -k "negative and integer" tests/test_mytest.py
+    pytest -k "negative and integer" tests/test_mytest.py

 or all but ``negative`` sub-tests, with:

 .. code-block:: bash

-   pytest -k "not negative" tests/test_mytest.py
+    pytest -k "not negative" tests/test_mytest.py

 Besides using the ``-k`` filter that was just mentioned, you can find out the exact name of each sub-test and run any
 or all of them using their exact names.
@@ -672,7 +670,7 @@ and it will list:

    test_this2.py::test_floor[integer-1-1.0]
    test_this2.py::test_floor[negative--1.5--2.0]
-    test_this2.py::test_floor[large fraction-1.6-1]       
+    test_this2.py::test_floor[large fraction-1.6-1]

 So now you can run just the specific test:

@@ -718,10 +716,10 @@ To start using those all you need is to make sure that the test resides in a sub
    from transformers.testing_utils import TestCasePlus
    class PathExampleTest(TestCasePlus):
        def test_something_involving_local_locations(self):
-            data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
+            data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"

-If you don't need to manipulated paths via ``pathlib`` or you just need a path as a string, you can always invoked
-``str()`` on the ``pathlib`` oboject or use the accessors ending with ``_str``. For example:
+If you don't need to manipulate paths via ``pathlib`` or you just need a path as a string, you can always invoked
+``str()`` on the ``pathlib`` object or use the accessors ending with ``_str``. For example:

 .. code-block:: python

@@ -795,6 +793,23 @@ leave any data in there.
   otherwise.


+Temporary sys.path override
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to temporary override ``sys.path`` to import from another test for example, you can use the
+``ExtendSysPath`` context manager. Example:
+
+
+.. code-block:: python
+
+    import os
+    from transformers.testing_utils import ExtendSysPath
+    bindir = os.path.abspath(os.path.dirname(__file__))
+    with ExtendSysPath(f"{bindir}/.."):
+        from test_trainer import TrainerIntegrationCommon  # noqa
+
+
+
 Skipping tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/Show More
+++ b/Show More