Release: v4.3.0

Update tokenizers requirement (#10077 )
Bump minimum Jax requirement to 2.8.0 (#10027 )
2021-02-08 18:31:49 +01:00 · 2021-02-08 18:29:16 +01:00 · 2021-02-08 18:18:26 +01:00
471 changed files with 8930 additions and 36944 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -77,10 +77,9 @@ jobs:
                  keys:
                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
@@ -91,34 +90,6 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/reports

-    run_tests_torch_and_flax:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.6
-        environment:
-            OMP_NUM_THREADS: 1
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - restore_cache:
-                  keys:
-                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}
-                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
-            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
-            - save_cache:
-                key: v0.4-{{ checksum "setup.py" }}
-                paths:
-                    - '~/.cache/pip'
-            - run: RUN_PT_FLAX_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
-            - store_artifacts:
-                  path: ~/transformers/tests_output.txt
-            - store_artifacts:
-                  path: ~/transformers/reports
-
    run_tests_torch:
        working_directory: ~/transformers
        docker:
@@ -133,10 +104,9 @@ jobs:
                  keys:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -188,7 +158,7 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,testing,sentencepiece]
+            - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
@@ -213,10 +183,9 @@ jobs:
                  keys:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -331,14 +300,13 @@ jobs:
                  keys:
                      - v0.4-build_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
            - run: pip install ."[all, docs]"
            - save_cache:
                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: cd docs && make html SPHINXOPTS="-W -j 4"
+            - run: cd docs && make html SPHINXOPTS="-W"
            - store_artifacts:
                path: ./docs/_build

@@ -445,7 +413,6 @@ workflows:
            - run_examples_torch
            - run_tests_custom_tokenizers
            - run_tests_torch_and_tf
-            - run_tests_torch_and_flax
            - run_tests_torch
            - run_tests_tf
            - run_tests_flax
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -3,7 +3,6 @@ cd docs
 function deploy_doc(){
 	echo "Creating doc at commit $1 and pushing to folder $2"
 	git checkout $1
-	pip install -U ..
 	if [ ! -z "$2" ]
 	then
 		if [ "$2" == "master" ]; then
@@ -46,7 +45,7 @@ deploy_doc "6f5a12a" v2.7.0
 deploy_doc "11c3257" v2.8.0
 deploy_doc "e7cfc1a" v2.9.0
 deploy_doc "7cb203f" v2.9.1
-deploy_doc "10d7239" v2.10.0
+deploy_doc "10d7239" v2.10.0 
 deploy_doc "b42586e" v2.11.0
 deploy_doc "7fb8bdf" v3.0.2
 deploy_doc "4b3ee9c" v3.1.0
@@ -54,7 +53,6 @@ deploy_doc "3ebb1b3" v3.2.0
 deploy_doc "0613f05" v3.3.1
 deploy_doc "eb0e0ce" v3.4.0
 deploy_doc "818878d" v3.5.1
-deploy_doc "c781171" v4.0.1
+deploy_doc "c781171" v4.0.0
 deploy_doc "bfa4ccf" v4.1.1
-deploy_doc "7d9a9d0" v4.2.2
-deploy_doc "bae0c79"  # v4.3.3 Latest stable release
+deploy_doc "7d9a9d0" # v4.2.0 Latest stable release
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +0,0 @@
-*.py	eol=lf
-*.rst	eol=lf
-*.md	eol=lf
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -42,7 +42,7 @@ Library:
 - deepspeed: @stas00
 - ray/raytune: @richardliaw, @amogkam
 - text generation: @patrickvonplaten
- tokenizers: @LysandreJik
+- tokenizers: @n1t0, @LysandreJik
 - trainer: @sgugger
 - pipelines: @LysandreJik

--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -14,7 +14,7 @@ requirements:
  host:
    - python
    - pip
-    - numpy >=1.17
+    - numpy
    - dataclasses
    - packaging
    - filelock
@@ -23,10 +23,10 @@ requirements:
    - sacremoses
    - regex !=2019.12.17
    - protobuf
-    - tokenizers >=0.10.1,<0.11.0
+    - tokenizers ==0.9.4
  run:
    - python
-    - numpy >=1.17
+    - numpy
    - dataclasses
    - packaging
    - filelock
@@ -35,7 +35,7 @@ requirements:
    - sacremoses
    - regex !=2019.12.17
    - protobuf
-    - tokenizers >=0.10.1,<0.11.0
+    - tokenizers ==0.9.4

 test:
  imports:
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -0,0 +1,18 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 60
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+  - Feature request
+# Label to use when marking an issue as stale
+staleLabel: wontfix
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -1,13 +1,15 @@
 name: Model templates runner

 on:
-  pull_request:
+  push:
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
-    types: [assigned, opened, synchronize, reopened]
+  pull_request_target:
+    branches:
+      - master

 jobs:
  run_tests_templates:
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -10,91 +10,143 @@ on:
      - "tests/**"
      - ".github/**"
      - "templates/**"
+  # pull_request:
  repository_dispatch:

+
 jobs:
  run_tests_torch_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+#      - name: Create model files
+#        run: |
+#          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
-          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt
-
+        
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports
+                  

  run_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

+      - name: Create model files
+        run: |
+          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
-          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt
-
+        
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
@@ -102,45 +154,58 @@ jobs:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

-
  run_tests_torch_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version

+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
-          MKL_SERVICE_FORCE_INTEL: 1
-          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_torch_multi_gpu_failures_short.txt
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt          

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -150,37 +215,52 @@ jobs:
          path: reports

  run_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version

+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
-          HF_HOME: /mnt/cache
+          OMP_NUM_THREADS: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -192,22 +272,4 @@ jobs:
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-latest
-    if: always()
-    needs: [run_tests_torch_gpu, run_tests_tf_gpu, run_tests_torch_multi_gpu, run_tests_tf_multi_gpu]
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/download-artifact@v2
-
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
-
-        run: |
-          pip install slack_sdk
-          python utils/notification_service.py push
+          
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,63 +1,82 @@
+# configuration notes:
+#
+# - `source .env/bin/activate` is currently needed to be run first thing first in each step. Otherwise
+# the step uses the system-wide python interpreter.
+
 name: Self-hosted runner (scheduled)

 on:
-  push:
-    branches:
-      - multi_ci_*
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"

 jobs:
  run_all_tests_torch_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v  1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt
-
+        
      - name: Run examples tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
        run: |
+          source .env/bin/activate
          pip install -r examples/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
@@ -66,13 +85,13 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          HF_HOME: /mnt/cache
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -85,40 +104,60 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

-  run_all_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+  run_all_tests_tf_gpu:
+    runs-on: [self-hosted, gpu, single-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          OMP_NUM_THREADS: 16
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
-          MKL_NUM_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
-
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+          
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt
@@ -126,19 +165,17 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          OMP_NUM_THREADS: 16
          RUN_PIPELINE_TESTS: yes
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
-          MKL_NUM_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
+        run: cat reports/tests_tf_pipelines_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -146,57 +183,86 @@ jobs:
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports
-
+          
  run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

-      - name: Run all tests on GPU
+      - name: Run all tests on multi-GPU
        env:
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
-          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

-      - name: Run all pipeline tests on GPU
+      - name: Run examples tests on multi-GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          HF_HOME: /mnt/cache
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -210,56 +276,73 @@ jobs:
          path: reports

  run_all_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

-      - name: Run all tests on GPU
+      - name: Run all tests on multi-GPU
        env:
-          OMP_NUM_THREADS: 16
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          MKL_NUM_THREADS: 16
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
-          HF_HOME: /mnt/cache
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

-      - name: Run all pipeline tests on GPU
+      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          MKL_NUM_THREADS: 16
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
-          HF_HOME: /mnt/cache
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
-
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
@@ -270,23 +353,4 @@ jobs:
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-latest
-    if: always()
-    needs: [run_all_tests_torch_gpu, run_all_tests_tf_gpu, run_all_tests_torch_multi_gpu, run_all_tests_tf_multi_gpu]
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/download-artifact@v2
-
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
-
-
-        run: |
-          pip install slack_sdk
-          python utils/notification_service.py scheduled
+          
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -1,27 +0,0 @@
-name: Stale Bot
-
-on:
-  schedule:
-    - cron: "0 0 * * *"
-
-jobs:
-  close_stale_issues:
-    name: Close Stale Issues
-    if: github.repository == 'huggingface/transformers'
-    runs-on: ubuntu-latest
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Setup Python
-      uses: actions/setup-python@v1
-      with:
-        python-version: 3.7
-
-    - name: Install requirements
-      run: |
-        pip install PyGithub
-    - name: Close stale issues
-      run: |
-        python scripts/stale.py
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -207,8 +207,6 @@ You are not required to read the following guidelines before opening an issue. H

   Do not dispair if you can't figure it out from the begining, just share what you can and perhaps someone else will be able to help you at the forums.

-   If your setup involves any custom datasets, the best way to help us reproduce the problem is to create a [Google Colab notebook](https://colab.research.google.com/) that demonstrates the issue and once you verify that the issue still exists, include a link to that notebook in the Issue. Just make sure that you don't copy and paste the location bar url of the open notebook - as this is private and we won't be able to open it. Instead, you need to click on `Share` in the right upper corner of the notebook, select `Get Link` and then copy and paste the public link it will give to you.
-
 7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.

 8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
--- a/16
+++ b/16
@@ -27,7 +27,6 @@ extra_quality_checks: deps_table_update
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/style_doc.py src/transformers docs/source --max_len 119
-	python utils/class_mapping_update.py

 # this target runs checks on all files
 quality:
@@ -69,18 +68,3 @@ test-examples:

 docs:
 	cd docs && make html SPHINXOPTS="-W -j 4"
-
-# Release stuff
-
-pre-release:
-	python utils/release.py
-
-pre-patch:
-	python utils/release.py --patch
-
-post-release:
-	python utils/release.py --post_release
-
-post-patch:
-	python utils/release.py --post_release --patch
-
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ Here are a few examples:
 - [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
 - [Name Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
 - [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [Natural Langugage Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
 - [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
 - [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
 - [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
@@ -200,8 +200,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
@@ -212,22 +211,18 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
 1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
@@ -237,7 +232,6 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

 To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable)
--- a/docs/README.md
+++ b/docs/README.md
@@ -26,7 +26,7 @@ pip install -e ".[docs]"
 ---
 **NOTE**

-You only need to generate the documentation to inspect it locally (if you're planning changes and want to
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to 
 check how they look like before committing for instance). You don't have to commit the built documentation.

 ---
@@ -65,7 +65,7 @@ make html
 ```

 A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
-browser.
+browser. 

 ---
 **NOTE**
@@ -95,15 +95,15 @@ following these steps:
  expand them).
 - Click on "details" next to the `ci/circleci: build_doc` check.
 - In the new window, click on the "Artifacts" tab.
- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a
+- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a 
  preview.

 ## Writing Documentation - Specification

 The `huggingface/transformers` documentation follows the
 [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
-mostly written in ReStructuredText
-([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
+mostly written in ReStructuredText 
+([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 
 [Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).


@@ -121,8 +121,8 @@ four.
 ### Adding a new model

 When adding a new model:
-
- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template).
+ 
+- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template). 
 - Link that file in `./source/index.rst` on the `model_doc` toc-tree.
 - Write a short overview of the model:
    - Overview with paper & authors
@@ -130,8 +130,8 @@ When adding a new model:
    - Tips and tricks and how to use it best
 - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
  every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
-  The order is generally:
-    - Configuration,
+  The order is generally: 
+    - Configuration, 
    - Tokenizer
    - PyTorch base model
    - PyTorch head models
@@ -179,7 +179,7 @@ Links should be done as so (note the double underscore at the end): \`text for t

 #### Defining arguments in a method

-Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
+Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 
 The argument should be followed by its type, with its shape if it is a tensor, and a line return.
 Another indentation is necessary before writing the description of the argument.

@@ -216,9 +216,9 @@ then its documentation should look like this:

 Note that we always omit the "defaults to :obj:\`None\`" when None is the default for any argument. Also note that even
 if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
-however write as many lines as you want in the indented description (see the example above with `input_ids`).
+however write as many lines as you want in the indented description (see the example above with `input_ids`). 

-#### Writing a multi-line code block
+#### Writing a multi-line code block 

 Multi-line code blocks can be useful for displaying examples. They are done like so:

@@ -237,7 +237,7 @@ the results stay consistent with the library.

 #### Writing a return block

-Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
+Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 
 The first line should be the type of the return, followed by a line return. No need to indent further for the elements
 building the return.

@@ -258,43 +258,3 @@ Here's an example for a single value return:
    Returns:
        :obj:`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
 ```
-
-#### Adding a new section
-
-In ReST section headers are designated as such with the help of a line of underlying characters, e.g.,:
-
-```
-Section 1
-^^^^^^^^^^^^^^^^^^
-
-Sub-section 1
-~~~~~~~~~~~~~~~~~~
-```
-
-ReST allows the use of any characters to designate different section levels, as long as they are used consistently within the same document. For details see [sections doc](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#sections). Because there is no standard different documents often end up using different characters for the same levels which makes it very difficult to know which character to use when creating a new section.
-
-Specifically, if when running `make docs` you get an error like:
-```
-docs/source/main_classes/trainer.rst:127:Title level inconsistent:
-```
-you picked an inconsistent character for some of the levels.
-
-But how do you know which characters you must use for an already existing level or when adding a new level?
-
-You can use this helper script:
-```
-perl -ne '/^(.)\1{100,}/ && do { $h{$1}=++$c if !$h{$1} }; END { %h = reverse %h ; print "$_ $h{$_}\n" for sort keys %h}' docs/source/main_classes/trainer.rst
-1 -
-2 ~
-3 ^
-4 =
-5 "
-```
-
-This tells you which characters have already been assigned for each level.
-
-So using this particular example's output -- if your current section's header uses `=` as its underline character, you now know you're at level 4, and if you want to add a sub-section header you know you want `"` as it'd level 5.
-
-If you needed to add yet another sub-level, then pick a character that is not used already. That is you must pick a character that is not in the output of that script.
-
-Here is the full list of characters that can be used in this context: `= - ` : ' " ~ ^ _ * + # < >`
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,11 +1,10 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.3.2"
+const stableVersion = "v4.2.0"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.3.0/v4.3.1/v4.3.2/v4.3.3 (stable)",
-    "v4.2.2": "v4.2.0/v4.2.1/v4.2.2",
+    "": "v4.2.0/v4.2.1 (stable)",
    "v4.1.1": "v4.1.0/v4.1.1",
    "v4.0.1": "v4.0.0/v4.0.1",
    "v3.5.1": "v3.5.0/v3.5.1",
@@ -128,11 +127,11 @@ function addVersionControl() {
    const parts = location.toString().split('/');
    let versionIndex = parts.length - 2;
    // Index page may not have a last part with filename.html so we need to go up
-    if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html/)) {
+    if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html$|^search.html?/)) {
        versionIndex = parts.length - 1;
    }
    // Main classes and models are nested so we need to go deeper
-    else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc" || parts[versionIndex] == "internal") {
+    else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc") {
        versionIndex = versionIndex - 1;
    } 
    const version = parts[versionIndex];
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -18,8 +18,8 @@ This page regroups resources around 🤗 Transformers developed by the community
 | [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb)  | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots |  [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
 | [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb)  | How to train on sequences as long as 500,000 tokens with Reformer |  [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb)  |
 | [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) |
-| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model |  [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
-| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) |
+| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb)  | How to generate tweets in the style of your favorite Twitter account by fine-tune a GPT-2 model |  [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
+| [A Step by Step Guide to Tracking Hugging Face Model Performance](https://colab.research.google.com/drive/1NEiqNPhiouu2pPwDAVeFoN4-vTYMz9F8)  | A quick tutorial for training NLP models with HuggingFace and & visualizing their performance with Weights & Biases |  [Jack Morris](https://github.com/jxmorris12) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1NEiqNPhiouu2pPwDAVeFoN4-vTYMz9F8) |
 | [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb)  | How to build a "long" version of existing pretrained models |  [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) |
 | [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) |
 | [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) |
@@ -30,7 +30,6 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
 |[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
 |[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
-|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)|
 |[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)|
 |[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
 |[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,10 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'4.4.2'
-
-
-
+release = u'4.3.0'
 # Prefix link to point to master, comment this during version release and uncomment below line
 extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
 # Prefix link to always point to corresponding version, uncomment this during version release
@@ -98,8 +95,7 @@ html_theme = 'sphinx_rtd_theme'
 # documentation.
 #
 html_theme_options = {
-    'analytics_id': 'UA-83738774-2',
-    'navigation_with_keys': True
+    'analytics_id': 'UA-83738774-2'
 }

 # Add any paths that contain custom static files (such as style sheets) here,
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -558,14 +558,15 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method
        end_positions = []
        for i in range(len(answers)):
            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
-            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
+            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

            # if start position is None, the answer passage has been truncated
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
-            if end_positions[-1] is None:
-                end_positions[-1] = tokenizer.model_max_length

+            # if end position is None, the 'char_to_token' function points to the space before the correct token - > add + 1
+            if end_positions[-1] is None:
+                end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    add_token_positions(train_encodings, train_answers)
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -21,7 +21,6 @@ General terms
 - CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
  tokens at a certain timestep.
- deep learning: machine learning algorithms which uses neural networks with several layers.
 - MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
  by masking some tokens randomly, and has to predict the original text.
 - multimodal: a task that combines texts with another kind of inputs (for instance images).
@@ -34,12 +33,10 @@ General terms
  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
  masking some words and trying to predict them (see MLM).
 - RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- self-attention: each element of the input finds out which other elements of the input they should attend to.
 - seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
  summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
 - token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
  or a punctuation symbol.
- transformer: self-attention based deep learning model architecture.

 Model inputs
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/imgs/transformers_overview.png
+++ b/docs/source/imgs/transformers_overview.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -114,115 +114,97 @@ and conversion utilities for the following models:
 11. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-12. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
-    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
-    Chen.
-13. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
-    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
+12. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
+    BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-14. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+13. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-15. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+14. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-16. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+15. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-17. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+16. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-18. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+17. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-19. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+18. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-20. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+19. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-21. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+20. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-22. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
-    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-23. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+21. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-24. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+22. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-25. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+23. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-26. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+24. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-27. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
-    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
-    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
-    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-28. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+25. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-29. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+26. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-30. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
-    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
-    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-31. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+27. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-32. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+28. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-33. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+29. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-34. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+30. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-35. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+31. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-36. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+32. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-37. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
-    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
-    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-38. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+33. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-39. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+34. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-40. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+35. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-41. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+36. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-42. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+37. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-43. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+38. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-44. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+39. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-45. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+40. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-46. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+41. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-47. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
-    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
-    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.


 .. _bigtable:
@@ -261,8 +243,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           DeBERTa           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -275,8 +255,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -285,8 +263,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           M2M100            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -301,7 +277,7 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             RAG             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|             RAG             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -309,8 +285,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         Speech2Text         |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -393,7 +367,6 @@ TensorFlow and/or Flax.
    main_classes/processors
    main_classes/tokenizer
    main_classes/trainer
-    main_classes/feature_extractor

 .. toctree::
    :maxdepth: 2
@@ -413,7 +386,6 @@ TensorFlow and/or Flax.
    model_doc/convbert
    model_doc/ctrl
    model_doc/deberta
-    model_doc/deberta_v2
    model_doc/dialogpt
    model_doc/distilbert
    model_doc/dpr
@@ -423,13 +395,11 @@ TensorFlow and/or Flax.
    model_doc/fsmt
    model_doc/funnel
    model_doc/herbert
-    model_doc/ibert
    model_doc/layoutlm
    model_doc/led
    model_doc/longformer
    model_doc/lxmert
    model_doc/marian
-    model_doc/m2m_100
    model_doc/mbart
    model_doc/mobilebert
    model_doc/mpnet
@@ -443,7 +413,6 @@ TensorFlow and/or Flax.
    model_doc/reformer
    model_doc/retribert
    model_doc/roberta
-    model_doc/speech_to_text
    model_doc/squeezebert
    model_doc/t5
    model_doc/tapas
@@ -453,7 +422,6 @@ TensorFlow and/or Flax.
    model_doc/xlmprophetnet
    model_doc/xlmroberta
    model_doc/xlnet
-    model_doc/xlsr_wav2vec2

 .. toctree::
    :maxdepth: 2
@@ -464,4 +432,3 @@ TensorFlow and/or Flax.
    internal/tokenization_utils
    internal/trainer_utils
    internal/generation_utils
-    internal/file_utils
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -155,31 +155,6 @@ If you expect to be downloading large volumes of models (more than 1,000) from o
 your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
 faster, and cheaper. Feel free to contact us privately if you need any help.

-### Offline mode
-
-It's possible to run 🤗 Transformers in a firewalled or a no-network environment.
-
-Setting environment variable `TRANSFORMERS_OFFLINE=1` will tell 🤗 Transformers to use local files only and will not try to look things up.
-
-Most likely you may want to couple this with `HF_DATASETS_OFFLINE=1` that performs the same for 🤗 Datasets if you're using the latter.
-
-Here is an example of how this can be used on a filesystem that is shared between a normally networked and a firewalled to the external world instances.
-
-On the instance with the normal network run your program which will download and cache models (and optionally datasets if you use 🤗 Datasets). For example:
-
-```
-python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
-```
-
-and then with the same filesystem you can now run the same program on a firewalled instance:
-```
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
-```
-and it should succeed without any hanging waiting to timeout.
-
-
-
 ## Do you want to run a Transformer model on a mobile device?

 You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
--- a/docs/source/internal/file_utils.rst
+++ b/docs/source/internal/file_utils.rst
@@ -1,54 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-General Utilities
-----------------------------------------------------------------------------------------------------------------------
-
-This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.
-
-Most of those are only useful if you are studying the general code in the library.
-
-
-Enums and namedtuples
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.ExplicitEnum
-
-.. autoclass:: transformers.file_utils.PaddingStrategy
-
-.. autoclass:: transformers.file_utils.TensorType
-
-
-Special Decorators
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.file_utils.add_start_docstrings
-
-.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
-
-.. autofunction:: transformers.file_utils.add_end_docstrings
-
-.. autofunction:: transformers.file_utils.add_code_sample_docstrings
-
-.. autofunction:: transformers.file_utils.replace_return_docstrings
-
-
-Special Properties
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.cached_property
-
-
-Other Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils._BaseLazyModule
--- a/docs/source/internal/generation_utils.rst
+++ b/docs/source/internal/generation_utils.rst
@@ -151,23 +151,6 @@ generation.
 .. autoclass:: transformers.HammingDiversityLogitsProcessor
    :members: __call__

-StoppingCriteria
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A :class:`~transformers.StoppingCriteria` can be used to change when to stop generation (other than EOS token).
-
-.. autoclass:: transformers.StoppingCriteria
-    :members: __call__
-
-.. autoclass:: transformers.StoppingCriteriaList
-    :members: __call__
-
-.. autoclass:: transformers.MaxLengthCriteria
-    :members: __call__
-
-.. autoclass:: transformers.MaxTimeCriteria
-    :members: __call__
-
 BeamSearch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@@ -38,6 +38,12 @@ SpecialTokensMixin
 Enums and namedtuples
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
+
+.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.TensorType
+
 .. autoclass:: transformers.tokenization_utils_base.TruncationStrategy

 .. autoclass:: transformers.tokenization_utils_base.CharSpan
--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@@ -22,7 +22,7 @@ Utilities

 .. autoclass:: transformers.EvalPrediction

-.. autoclass:: transformers.IntervalStrategy
+.. autoclass:: transformers.EvaluationStrategy

 .. autofunction:: transformers.set_seed

--- a/docs/source/main_classes/feature_extractor.rst
+++ b/docs/source/main_classes/feature_extractor.rst
@@ -1,41 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-
-Feature Extractor
-----------------------------------------------------------------------------------------------------------------------
-
-A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
-from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
-*e.g.* cropping image image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
-tensors.
-
-
-FeatureExtractionMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.feature_extraction_utils.FeatureExtractionMixin
-    :members: from_pretrained, save_pretrained
-
-
-SequenceFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SequenceFeatureExtractor
-    :members: pad
-
-
-BatchFeature
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BatchFeature
-    :members:
--- a/docs/source/main_classes/logging.rst
+++ b/docs/source/main_classes/logging.rst
@@ -65,10 +65,6 @@ Other functions

 .. autofunction:: transformers.logging.get_logger

-.. autofunction:: transformers.logging.enable_default_handler
-
-.. autofunction:: transformers.logging.disable_default_handler
-
 .. autofunction:: transformers.logging.enable_explicit_format

 .. autofunction:: transformers.logging.reset_format
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -60,7 +60,7 @@ ModelOutput
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.file_utils.ModelOutput
-    :members: to_tuple
+    :members:


 BaseModelOutput
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -54,9 +54,9 @@ PreTrainedTokenizer

 .. autoclass:: transformers.PreTrainedTokenizer
    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add, prepare_for_tokenization, tokenize,
-        vocab_size
+    :members:
+
+    .. automethod:: encode


 PreTrainedTokenizerFast
@@ -64,9 +64,9 @@ PreTrainedTokenizerFast

 .. autoclass:: transformers.PreTrainedTokenizerFast
    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add,
-        set_truncation_and_padding,tokenize, vocab_size
+    :members:
+
+    .. automethod:: encode


 BatchEncoding
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -21,16 +21,16 @@ Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.T
 customization during training.

 The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
-<https://github.com/NVIDIA/apex>`__ and Native AMP for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
+<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.

-Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop which supports
-the above features. To inject custom behavior you can subclass them and override the following methods:
+Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
+previous features. To inject custom behavior you can subclass them and override the following methods:

 - **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
 - **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
 - **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
 - **log** -- Logs information on the various objects watching training.
- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
+- **create_optimizer_and_scheduler** -- Setups the optimizer and learning rate scheduler if they were not passed at
  init.
 - **compute_loss** - Computes the loss on a batch of training inputs.
 - **training_step** -- Performs a training step.
@@ -39,35 +39,17 @@ the above features. To inject custom behavior you can subclass them and override
 - **evaluate** -- Runs an evaluation loop and returns metrics.
 - **predict** -- Returns predictions (with metrics if labels are available) on a test set.

-.. warning::
-
-    The :class:`~transformers.Trainer` class is optimized for 🤗 Transformers models and can have surprising behaviors
-    when you use it on other models. When using it on your own model, make sure:
-
-    - your model always return tuples or subclasses of :class:`~transformers.file_utils.ModelOutput`.
-    - your model can compute the loss if a :obj:`labels` argument is provided and that loss is returned as the first
-      element of the tuple (if your model returns tuples)
-    - your model can accept multiple label arguments (use the :obj:`label_names` in your
-      :class:`~transformers.TrainingArguments` to indicate their name to the :class:`~transformers.Trainer`) but none
-      of them should be named :obj:`"label"`.
-
-Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function for multi-label
-classification:
+Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function:

 .. code-block:: python

-    import torch
    from transformers import Trainer
-
-    class MultilabelTrainer(Trainer):
-        def compute_loss(self, model, inputs, return_outputs=False):
+    class MyTrainer(Trainer):
+        def compute_loss(self, model, inputs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
-            logits = outputs.logits
-            loss_fct = torch.nn.BCEWithLogitsLoss()
-            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
-                            labels.float().view(-1, self.model.config.num_labels))
-            return (loss, outputs) if return_outputs else loss
+            logits = outputs[0]
+            return my_custom_loss(logits, labels)

 Another way to customize the training loop behavior for the PyTorch :class:`~transformers.Trainer` is to use
 :doc:`callbacks <callback>` that can inspect the training loop state (for progress reporting, logging on TensorBoard or
@@ -259,8 +241,6 @@ provides support for the following features from `the ZeRO paper <https://arxiv.

 1. Optimizer State Sharding
 2. Gradient Sharding
-3. Model Parameters Sharding (new and very experimental)
-4. CPU offload (new and very experimental)

 You will need at least two GPUs to use this feature.

@@ -275,69 +255,31 @@ To deploy this feature:
   or find more details on `the FairScale's GitHub page
   <https://github.com/facebookresearch/fairscale/#installation>`__.

-2. To use the first version of Sharded data-parallelism, add ``--sharded_ddp simple`` to the command line arguments,
-   and make sure you have added the distributed launcher ``-m torch.distributed.launch
-   --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
+2. Add ``--sharded_ddp`` to the command line arguments, and make sure you have added the distributed launcher ``-m
+   torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.

-For example here is how you could use it for ``run_translation.py`` with 2 GPUs:
+For example here is how you could use it for ``finetune_trainer.py`` with 2 GPUs:

 .. code-block:: bash

-    python -m torch.distributed.launch --nproc_per_node=2 examples/seq2seq/run_translation.py \
-    --model_name_or_path t5-small --per_device_train_batch_size 1   \
+    cd examples/seq2seq
+    python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
    --output_dir output_dir --overwrite_output_dir \
-    --do_train --max_train_samples 500 --num_train_epochs 1 \
-    --dataset_name wmt16 --dataset_config "ro-en" \
-    --source_lang en --target_lang ro \
-    --fp16 --sharded_ddp simple
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation \
+    --fp16 --sharded_ddp

 Notes:

 - This feature requires distributed training (so multiple GPUs).
 - It is not implemented for TPUs.
 - It works with ``--fp16`` too, to make things even faster.
- One of the main benefits of enabling ``--sharded_ddp simple`` is that it uses a lot less GPU memory, so you should be
-  able to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
+- One of the main benefits of enabling ``--sharded_ddp`` is that it uses a lot less GPU memory, so you should be able
+  to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
  significantly shorter training time.

-3. To use the second version of Sharded data-parallelism, add ``--sharded_ddp zero_dp_2`` or ``--sharded_ddp zero_dp_3`
-   to the command line arguments, and make sure you have added the distributed launcher ``-m torch.distributed.launch
-   --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
-
-For example here is how you could use it for ``run_translation.py`` with 2 GPUs:
-
-.. code-block:: bash
-
-    python -m torch.distributed.launch --nproc_per_node=2 examples/seq2seq/run_translation.py \
-    --model_name_or_path t5-small --per_device_train_batch_size 1   \
-    --output_dir output_dir --overwrite_output_dir \
-    --do_train --max_train_samples 500 --num_train_epochs 1 \
-    --dataset_name wmt16 --dataset_config "ro-en" \
-    --source_lang en --target_lang ro \
-    --fp16 --sharded_ddp zero_dp_2
-
-:obj:`zero_dp_2` is an optimized version of the simple wrapper, while :obj:`zero_dp_3` fully shards model weights,
-gradients and optimizer states.
-
-Both are compatible with adding :obj:`cpu_offload` to enable ZeRO-offload (activate it like this: :obj:`--sharded_ddp
-"zero_dp_2 cpu_offload"`).
-
-Notes:
-
- This feature requires distributed training (so multiple GPUs).
- It is not implemented for TPUs.
- It works with ``--fp16`` too, to make things even faster.
- The ``cpu_offload`` additional option requires ``--fp16``.
- This is an area of active development, so make sure you have a source install of fairscale to use this feature as
-  some bugs you encounter may have been fixed there already.
-
-Known caveats:
-
- This feature is incompatible with :obj:`--predict_with_generate` in the `run_translation.py` script.
- Using :obj:`--sharded_ddp zero_dp_3` requires wrapping each layer of the model in the special container
-  :obj:`FullyShardedDataParallelism` of fairscale. It should be used with the option :obj:`auto_wrap` if you are not
-  doing this yourself: :obj:`--sharded_ddp "zero_dp_3 auto_wrap"`.
-

 DeepSpeed
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -348,16 +290,6 @@ full support for:

 1. Optimizer State Partitioning (ZeRO stage 1)
 2. Add Gradient Partitioning (ZeRO stage 2)
-3. Custom fp16 handling
-4. A range of fast Cuda-extension-based Optimizers
-5. ZeRO-Offload
-
-ZeRO-Offload has its own dedicated paper: `ZeRO-Offload: Democratizing Billion-Scale Model Training
-<https://arxiv.org/abs/2101.06840>`__.
-
-DeepSpeed is currently used only for training, as all the currently available features are of no use to inference.
-
-

 Installation
 =======================================================================================================================
@@ -397,23 +329,17 @@ Unlike, ``torch.distributed.launch`` where you have to specify how many GPUs to
 full details on how to configure various nodes and GPUs can be found `here
 <https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__.

-In fact, you can continue using ``-m torch.distributed.launch`` with DeepSpeed as long as you don't need to use
-``deepspeed`` launcher-specific arguments. Typically if you don't need a multi-node setup you're not required to use
-the ``deepspeed`` launcher. But since in the DeepSpeed documentation it'll be used everywhere, for consistency we will
-use it here as well.
-
-Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs:
+Here is an example of running ``finetune_trainer.py`` under DeepSpeed deploying all available GPUs:

 .. code-block:: bash

-    deepspeed examples/seq2seq/run_translation.py \
-    --deepspeed examples/tests/deepspeed/ds_config.json \
-    --model_name_or_path t5-small --per_device_train_batch_size 1   \
-    --output_dir output_dir --overwrite_output_dir --fp16 \
-    --do_train --max_train_samples 500 --num_train_epochs 1 \
-    --dataset_name wmt16 --dataset_config "ro-en" \
-    --source_lang en --target_lang ro
-
+    cd examples/seq2seq
+    deepspeed ./finetune_trainer.py --deepspeed ds_config.json \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation

 Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` - i.e.
 two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments to deal
@@ -431,13 +357,13 @@ To deploy DeepSpeed with one GPU adjust the :class:`~transformers.Trainer` comma

 .. code-block:: bash

-    deepspeed --num_gpus=1 examples/seq2seq/run_translation.py \
-    --deepspeed examples/tests/deepspeed/ds_config.json \
-    --model_name_or_path t5-small --per_device_train_batch_size 1   \
-    --output_dir output_dir --overwrite_output_dir --fp16 \
-    --do_train --max_train_samples 500 --num_train_epochs 1 \
-    --dataset_name wmt16 --dataset_config "ro-en" \
-    --source_lang en --target_lang ro
+    cd examples/seq2seq
+    deepspeed --num_gpus=1 ./finetune_trainer.py --deepspeed ds_config.json \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation

 This is almost the same as with multiple-GPUs, but here we tell DeepSpeed explicitly to use just one GPU. By default,
 DeepSpeed deploys all GPUs it can see. If you have only 1 GPU to start with, then you don't need this argument. The
@@ -476,142 +402,12 @@ find more details in the discussion below.
 For a practical usage example of this type of deployment, please, see this `post
 <https://github.com/huggingface/transformers/issues/8771#issuecomment-759176685>`__.

-Notes:
-
- if you need to run on a specific GPU, which is different from GPU 0, you can't use ``CUDA_VISIBLE_DEVICES`` to limit
-  the visible scope of available GPUs. Instead, you have to use the following syntax:
-
-   .. code-block:: bash
-
-       deepspeed --include localhost:1 examples/seq2seq/run_translation.py ...
-
-   In this example, we tell DeepSpeed to use GPU 1 (second gpu).
-
-
-
-Deployment in Notebooks
-=======================================================================================================================
-
-The problem with running notebook cells as a script is that there is no normal ``deepspeed`` launcher to rely on, so
-under certain setups we have to emulate it.
-
-Here is how you'd have to adjust your training code in the notebook to use DeepSpeed.
-
-.. code-block:: python
-
-    # DeepSpeed requires a distributed environment even when only one process is used.
-    # This emulates a launcher in the notebook
-    import os
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use
-    os.environ['RANK'] = "0"
-    os.environ['LOCAL_RANK'] = "0"
-    os.environ['WORLD_SIZE'] = "1"
-
-    # Now proceed as normal, plus pass the deepspeed config file
-    training_args = TrainingArguments(..., deepspeed="ds_config.json")
-    trainer = Trainer(...)
-    trainer.train()
-
-Note: `...` stands for the normal arguments that you'd pass to the functions.
-
-If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated
-cell with:
-
-.. code-block:: python
-
-    %%bash
-    cat <<'EOT' > ds_config.json
-    {
-        "fp16": {
-            "enabled": true,
-            "loss_scale": 0,
-            "loss_scale_window": 1000,
-            "hysteresis": 2,
-            "min_loss_scale": 1
-        },
-
-        "zero_optimization": {
-            "stage": 2,
-            "allgather_partitions": true,
-            "allgather_bucket_size": 2e8,
-            "overlap_comm": true,
-            "reduce_scatter": true,
-            "reduce_bucket_size": 2e8,
-            "contiguous_gradients": true,
-            "cpu_offload": true
-        },
-
-        "zero_allow_untested_optimizer": true,
-
-        "optimizer": {
-            "type": "AdamW",
-            "params": {
-                "lr": 3e-5,
-                "betas": [0.8, 0.999],
-                "eps": 1e-8,
-                "weight_decay": 3e-7
-            }
-        },
-
-        "scheduler": {
-            "type": "WarmupLR",
-            "params": {
-                "warmup_min_lr": 0,
-                "warmup_max_lr": 3e-5,
-                "warmup_num_steps": 500
-            }
-        },
-
-        "steps_per_print": 2000,
-        "wall_clock_breakdown": false
-    }
-    EOT
-
-
-That's said if the script is not in the notebook cells, you can launch ``deepspeed`` normally via shell from a cell
-with:
-
-.. code-block::
-
-   !deepspeed examples/seq2seq/run_translation.py ...
-
-or with bash magic, where you can write a multi-line code for the shell to run:
-
-.. code-block::
-
-   %%bash
-
-   cd /somewhere
-   deepspeed examples/seq2seq/run_translation.py ...
-
-
-
-
 Configuration
 =======================================================================================================================

 For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer
 to the `following documentation <https://www.deepspeed.ai/docs/config-json/>`__.

-You can find dozens of DeepSpeed configuration examples that address various practical needs in `the DeepSpeedExamples
-repo <https://github.com/microsoft/DeepSpeedExamples>`__:
-
-.. code-block:: bash
-
-  git clone https://github.com/microsoft/DeepSpeedExamples
-  cd DeepSpeedExamples
-  find . -name '*json'
-
-Continuing the code from above, let's say you're looking to configure the Lamb optimizer. So you can search through the
-example ``.json`` files with:
-
-.. code-block:: bash
-
-  grep -i Lamb $(find . -name '*json')
-
-Some more examples are to be found in the `main repo <https://github.com/microsoft/DeepSpeed>`__ as well.
-
 While you always have to supply the DeepSpeed configuration file, you can configure the DeepSpeed integration in
 several ways:

@@ -655,6 +451,7 @@ enables FP16, uses AdamW optimizer and WarmupLR scheduler:
           "weight_decay": 3e-7
         }
       },
+       "zero_allow_untested_optimizer": true,

       "scheduler": {
         "type": "WarmupLR",
@@ -750,11 +547,7 @@ Notes:
 - ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x
  the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB
  footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting
-  OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB. You will want to do
-  the same on larger capacity GPU as well, if you're starting to hit OOM.
- when reducing these buffers you're trading communication speed to avail more GPU RAM. The smaller the buffer size,
-  the slower the communication, and the more GPU RAM will be available to other tasks. So if a bigger batch size is
-  important, getting a slightly slower training time could be a good trade.
+  OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB.

 This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
 no equivalent command line arguments.
@@ -765,8 +558,8 @@ Optimizer
 =======================================================================================================================


-DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
-thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
+DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
+recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
 <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.

 If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
@@ -778,6 +571,7 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
 .. code-block:: json

    {
+       "zero_allow_untested_optimizer": true,
       "optimizer": {
           "type": "AdamW",
           "params": {
@@ -789,8 +583,8 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
         }
    }

-If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
-true`` to the top level configuration.
+Since AdamW isn't on the list of tested with DeepSpeed/ZeRO optimizers, we have to add
+``zero_allow_untested_optimizer`` flag.

 If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
 make sure to adjust the values. e.g. if use Adam you will want ``weight_decay`` around ``0.01``.
@@ -887,28 +681,6 @@ Here is an example of the ``amp`` configuration:
    }


-Gradient Accumulation
-=======================================================================================================================
-
-While normally DeepSpeed gets gradient accumulation configured with:
-
-.. code-block:: json
-
-    {
-        "gradient_accumulation_steps": 3,
-    }
-
-in this case, to enable gradient accumulation, pass the command line `--gradient_accumulation_steps` argument as normal
-and it will get injected into the DeepSpeed configuration.
-
-If you try to add it directly to the configuration file, you will receive an error from the Trainer - this is because
-this setting is needed by the Trainer too, and so this approach ensures that there is a single way of setting this
-value and thus avoid potential subtle errors.
-
-
-
-
-

 Gradient Clipping
 =======================================================================================================================
@@ -945,11 +717,6 @@ Main DeepSpeed Resources
 - `API docs <https://deepspeed.readthedocs.io/en/latest/index.html>`__
 - `Blog posts <https://www.microsoft.com/en-us/research/search/?q=deepspeed>`__

-Papers:
-
- `ZeRO: Memory Optimizations Toward Training Trillion Parameter Models <https://arxiv.org/abs/1910.02054>`__
- `ZeRO-Offload: Democratizing Billion-Scale Model Training <https://arxiv.org/abs/2101.06840>`__
-
 Finally, please, remember that, HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed, therefore if you
 have any problems or questions with regards to DeepSpeed usage, please, file an issue with `DeepSpeed GitHub
 <https://github.com/microsoft/DeepSpeed/issues>`__.
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -60,7 +60,7 @@ DebertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaModel
-    :members: forward
+    :members:


 DebertaPreTrainedModel
@@ -74,25 +74,25 @@ DebertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForMaskedLM
-    :members: forward
+    :members:


 DebertaForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForSequenceClassification
-    :members: forward
+    :members:


 DebertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForTokenClassification
-    :members: forward
+    :members:


 DebertaForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForQuestionAnswering
-    :members: forward
+    :members:
--- a/docs/source/model_doc/deberta_v2.rst
+++ b/docs/source/model_doc/deberta_v2.rst
@@ -1,118 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-DeBERTa-v2
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
-<https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's
-BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
-
-It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
-RoBERTa.
-
-The abstract from the paper is the following:
-
-*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
-language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
-disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
-disentangled attention mechanism, where each word is represented using two vectors that encode its content and
-position, respectively, and the attention weights among words are computed using disentangled matrices on their
-contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
-predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
-the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
-(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
-pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
-
-
-The following information is visible directly on the [original implementation
-repository](https://github.com/microsoft/DeBERTa). DeBERTa v2 is the second version of the DeBERTa model. It includes
-the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. You can
-find more details about this submission in the authors'
-[blog](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)
-
-New in v2:
-
- **Vocabulary** In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data.
-  Instead of a GPT2-based tokenizer, the tokenizer is now
-  [sentencepiece-based](https://github.com/google/sentencepiece) tokenizer.
- **nGiE(nGram Induced Input Encoding)** The DeBERTa-v2 model uses an additional convolution layer aside with the first
-  transformer layer to better learn the local dependency of input tokens.
- **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
-  experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative postions** The DeBERTa-v2 model uses log bucket to encode relative positions
-  similar to T5.
- **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
-  performance of downstream tasks.
-
-The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
-
-
-DebertaV2Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2Config
-    :members:
-
-
-DebertaV2Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2Tokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-DebertaV2Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2Model
-    :members: forward
-
-
-DebertaV2PreTrainedModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2PreTrainedModel
-    :members: forward
-
-
-DebertaV2ForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForMaskedLM
-    :members: forward
-
-
-DebertaV2ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForSequenceClassification
-    :members: forward
-
-
-DebertaV2ForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForTokenClassification
-    :members: forward
-
-
-DebertaV2ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForQuestionAnswering
-    :members: forward
--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@@ -56,7 +56,7 @@ FSMTTokenizer

 .. autoclass:: transformers.FSMTTokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


 FSMTModel
--- a/docs/source/model_doc/ibert.rst
+++ b/docs/source/model_doc/ibert.rst
@@ -1,88 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-I-BERT
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The I-BERT model was proposed in `I-BERT: Integer-only BERT Quantization <https://arxiv.org/abs/2101.01321>`__ by
-Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney and Kurt Keutzer. It's a quantized version of RoBERTa running
-inference up to four times faster.
-
-The abstract from the paper is the following:
-
-*Transformer based models, like BERT and RoBERTa, have achieved state-of-the-art results in many Natural Language
-Processing tasks. However, their memory footprint, inference latency, and power consumption are prohibitive for
-efficient inference at the edge, and even at the data center. While quantization can be a viable solution for this,
-previous work on quantizing Transformer based models use floating-point arithmetic during inference, which cannot
-efficiently utilize integer-only logical units such as the recent Turing Tensor Cores, or traditional integer-only ARM
-processors. In this work, we propose I-BERT, a novel quantization scheme for Transformer based models that quantizes
-the entire inference with integer-only arithmetic. Based on lightweight integer-only approximation methods for
-nonlinear operations, e.g., GELU, Softmax, and Layer Normalization, I-BERT performs an end-to-end integer-only BERT
-inference without any floating point calculation. We evaluate our approach on GLUE downstream tasks using
-RoBERTa-Base/Large. We show that for both cases, I-BERT achieves similar (and slightly higher) accuracy as compared to
-the full-precision baseline. Furthermore, our preliminary implementation of I-BERT shows a speedup of 2.4 - 4.0x for
-INT8 inference on a T4 GPU system as compared to FP32 inference. The framework has been developed in PyTorch and has
-been open-sourced.*
-
-
-The original code can be found `here <https://github.com/kssteven418/I-BERT>`__.
-
-IBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertConfig
-    :members:
-
-
-IBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertModel
-    :members: forward
-
-
-IBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForMaskedLM
-    :members: forward
-
-
-IBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForSequenceClassification
-    :members: forward
-
-
-IBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForMultipleChoice
-    :members: forward
-
-
-IBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForTokenClassification
-    :members: forward
-
-
-IBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForQuestionAnswering
-    :members: forward
--- a/docs/source/model_doc/m2m_100.rst
+++ b/docs/source/model_doc/m2m_100.rst
@@ -1,128 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-M2M100
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The M2M100 model was proposed in `Beyond English-Centric Multilingual Machine Translation
-<https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky,
-Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy
-Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-
-The abstract from the paper is the following:
-
-*Existing work in translation demonstrated the potential of massively multilingual machine translation by training a
-single model able to translate between any pair of languages. However, much of this work is English-Centric by training
-only on data which was translated from or to English. While this is supported by large sources of training data, it
-does not reflect translation needs worldwide. In this work, we create a true Many-to-Many multilingual translation
-model that can translate directly between any pair of 100 languages. We build and open source a training dataset that
-covers thousands of language directions with supervised data, created through large-scale mining. Then, we explore how
-to effectively increase model capacity through a combination of dense scaling and language-specific sparse parameters
-to create high quality models. Our focus on non-English-Centric models brings gains of more than 10 BLEU when directly
-translating between non-English directions while performing competitively to the best single systems of WMT. We
-open-source our scripts so that others may reproduce the data, evaluation, and final M2M-100 model.*
-
-
-Training and Generation
-_______________________________________________________________________________________________________________________
-
-M2M100 is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is
-multilingual it expects the sequences in a certain format: A special language id token is used as prefix in both the
-source and target text. The source text format is :obj:`[lang_code] X [eos]`, where :obj:`lang_code` is source language
-id for source text and target language id for target text, with :obj:`X` being the source or target text.
-
-The :class:`~transformers.M2M100Tokenizer` depends on :obj:`sentencepiece` so be sure to install it before running the
-examples. To install :obj:`sentencepiece` run ``pip install sentencepiece``.
-
- Supervised Training
-
-.. code-block::
-
-    from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
-
-    model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-    tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
-
-    src_text = "Life is like a box of chocolates."
-    tgt_lang = "La vie est comme une boîte de chocolat."
-
-    model_inputs = tokenizer(src_text, return_tensors="pt")
-    with tokenizer.as_target_tokenizer():
-        labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-
-    loss = model(**model_inputs, labels=labels) # forward pass
-
-
- Generation
-
-    M2M100 uses the :obj:`eos_token_id` as the :obj:`decoder_start_token_id` for generation with the target language id
-    being forced as the first generated token. To force the target language id as the first generated token, pass the
-    `forced_bos_token_id` parameter to the `generate` method. The following example shows how to translate between
-    Hindi to French and Chinese to English using the `facebook/m2m100_418M` checkpoint.
-
-.. code-block::
-
-    >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
-
-    >>> hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
-    >>> chinese_text = "生活就像一盒巧克力。"
-
-    >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
-    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
-
-    >>> # translate Hindi to French
-    >>> tokenizer.src_lang = "hi"
-    >>> encoded_hi = tokenizer(hi_text, return_tensors="pt")
-    >>> generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
-    >>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    "La vie est comme une boîte de chocolat."
-
-    >>> # translate Chinese to English
-    >>> tokenizer.src_lang = "zh"
-    >>> encoded_zh = tokenizer(chinese_text, return_tensors="pt")
-    >>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
-    >>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    "Life is like a box of chocolate."
-
-
-M2M100Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100Config
-    :members:
-
-
-M2M100Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100Tokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-M2M100Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100Model
-    :members: forward
-
-
-M2M100ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100ForConditionalGeneration
-    :members: forward
-
-
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -76,29 +76,27 @@ require 3 character language codes:

 .. code-block:: python

-    >>> from transformers import MarianMTModel, MarianTokenizer
-    >>> src_text = [
-    ...     '>>fra<< this is a sentence in english that we want to translate to french',
-    ...     '>>por<< This should go to portuguese',
-    ...     '>>esp<< And this to Spanish'
-    >>> ]
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fra<< this is a sentence in english that we want to translate to french',
+        '>>por<< This should go to portuguese',
+        '>>esp<< And this to Spanish'
+    ]

-    >>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
-    >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
-    >>> print(tokenizer.supported_language_codes)
-    ['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
-
-    >>> model = MarianMTModel.from_pretrained(model_name)
-    >>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
-    >>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    ["c'est une phrase en anglais que nous voulons traduire en français",
-     'Isto deve ir para o português.',
-     'Y esto al español']
+    model_name = 'Helsinki-NLP/opus-mt-en-roa'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']




-Here is the code to see all available pretrained models on the hub:
+Code to see available pretrained models:

 .. code-block:: python

@@ -149,22 +147,21 @@ Example of translating english to many romance languages, using old-style 2 char

 .. code-block::python

-    >>> from transformers import MarianMTModel, MarianTokenizer
-    >>> src_text = [
-    ...     '>>fr<< this is a sentence in english that we want to translate to french',
-    ...     '>>pt<< This should go to portuguese',
-    ...     '>>es<< And this to Spanish'
-    >>> ]
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fr<< this is a sentence in english that we want to translate to french',
+        '>>pt<< This should go to portuguese',
+        '>>es<< And this to Spanish'
+    ]

-    >>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
-    >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)

-    >>> model = MarianMTModel.from_pretrained(model_name)
-    >>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
-    >>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    ["c'est une phrase en anglais que nous voulons traduire en français", 
-     'Isto deve ir para o português.',
-     'Y esto al español']
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.',  'Y esto al español']



@@ -179,7 +176,7 @@ MarianTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MarianTokenizer
-    :members: as_target_tokenizer
+    :members: prepare_seq2seq_batch


 MarianModel
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -10,14 +10,14 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-MBart and MBart-50
+MBart
 -----------------------------------------------------------------------------------------------------------------------

 **DISCLAIMER:** If you see something strange, file a `Github Issue
 <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@patrickvonplaten

-Overview of MBart
+Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 The MBart model was presented in `Multilingual Denoising Pre-training for Neural Machine Translation
@@ -31,34 +31,33 @@ on the encoder, decoder, or reconstructing parts of the text.

 The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__

-Training of MBart
+Examples
 _______________________________________________________________________________________________________________________

-MBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for translation task. As the
-model is multilingual it expects the sequences in a different format. A special language id token is added in both the
-source and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The
-target text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
+- Examples and scripts for fine-tuning mBART and other models for sequence to sequence tasks can be found in
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
+  :class:`MarianMTModel` is usually a better choice for bilingual machine translation.

-The regular :meth:`~transformers.MBartTokenizer.__call__` will encode source text format, and it should be wrapped
-inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokenizer` to encode target text format.
+Training
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+MBart is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation task. As the model is
+multilingual it expects the sequences in a different format. A special language id token is added in both the source
+and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The target
+text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
+
+The :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch` handles this automatically and should be used to encode
+the sequences for sequence-to-sequence fine-tuning.

 - Supervised training

 .. code-block::

-    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer
-
-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
-    >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
-    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
-
-    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
-    >>> # forward pass
-    >>> model(**inputs, labels=batch['labels'])
+    example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
+    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
+    model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

 - Generation

@@ -67,95 +66,14 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz

 .. code-block::

-    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer
-
-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
-    >>> article = "UN Chief Says There Is No Military Solution in Syria"
-    >>> inputs = tokenizer(article, return_tensors="pt")
-    >>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
-    >>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-    "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-
-Overview of MBart-50
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-MBart-50 was introduced in the `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning
-<https://arxiv.org/abs/2008.00401>` paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
-Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original `mbart-large-cc25` checkpoint by extendeding
-its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
-languages.
-
-According to the abstract
-
-*Multilingual translation models can be created through multilingual finetuning. Instead of finetuning on one
-direction, a pretrained model is finetuned on many directions at the same time. It demonstrates that pretrained models
-can be extended to incorporate additional languages without loss of performance. Multilingual finetuning improves on
-average 1 BLEU over the strongest baselines (being either multilingual from scratch or bilingual finetuning) while
-improving 9.3 BLEU on average over bilingual baselines from scratch.*
-
-
-Training of MBart-50
-_______________________________________________________________________________________________________________________
-
-The text format for MBart-50 is slightly different from mBART. For MBart-50 the language id token is used as a prefix
-for both source and target text i.e the text format is :obj:`[lang_code] X [eos]`, where :obj:`lang_code` is source
-language id for source text and target language id for target text, with :obj:`X` being the source or target text
-respectively.
-
-
-MBart-50 has its own tokenizer :class:`~transformers.MBart50Tokenizer`.
-
-  Supervised training
-
-.. code-block::
-
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-
-    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
-    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
-
-    src_text = " UN Chief Says There Is No Military Solution in Syria"
-    tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-    model_inputs = tokenizer(src_text, return_tensors="pt")
-    with tokenizer.as_target_tokenizer():
-        labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-
-    model(**model_inputs, labels=labels) # forward pass
-
-
- Generation
-
-    To generate using the mBART-50 multilingual translation models, :obj:`eos_token_id` is used as the
-    :obj:`decoder_start_token_id` and the target language id is forced as the first generated token. To force the
-    target language id as the first generated token, pass the `forced_bos_token_id` parameter to the `generate` method.
-    The following example shows how to translate between Hindi to French and Arabic to English using the
-    `facebook/mbart-50-large-many-to-many` checkpoint.
-
-.. code-block::
-
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-
-    article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
-    article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
-
-    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-
-    # translate Hindi to French
-    tokenizer.src_lang = "hi_IN"
-    encoded_hi = tokenizer(article_hi, return_tensors="pt")
-    generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
-    tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    # => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."
-
-    # translate Arabic to English
-    tokenizer.src_lang = "ar_AR"
-    encoded_ar = tokenizer(article_ar, return_tensors="pt")
-    generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
-    tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    # => "The Secretary-General of the United Nations says there is no military solution in Syria."
+    from transformers import MBartForConditionalGeneration, MBartTokenizer
+    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
+    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
+    article = "UN Chief Says There Is No Military Solution in Syria"
+    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
+    translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
+    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"


 MBartConfig
@@ -169,7 +87,7 @@ MBartTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MBartTokenizer
-    :members: as_target_tokenizer, build_inputs_with_special_tokens
+    :members: build_inputs_with_special_tokens, prepare_seq2seq_batch


 MBartTokenizerFast
@@ -179,20 +97,6 @@ MBartTokenizerFast
    :members:


-MBart50Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MBart50Tokenizer
-    :members:
-
-
-MBart50TokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MBart50TokenizerFast
-    :members:
-
-
 MBartModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -51,8 +51,8 @@ All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-
 Examples
 _______________________________________________________________________________________________________________________

- :prefix_link:`Script <examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh>` to fine-tune pegasus
-  on the XSUM dataset. Data download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- :prefix_link:`Script <examples/seq2seq/finetune_pegasus_xsum.sh>` to fine-tune pegasus on the XSUM dataset. Data
+  download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
 - FP16 is not supported (help/ideas on this appreciated!).
 - The adafactor optimizer is recommended for pegasus fine-tuning.

@@ -78,20 +78,20 @@ Usage Example

 .. code-block:: python

-    >>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
-    >>> import torch
-    >>> src_text = [
-    ...     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
-    >>> ]
+    from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+    import torch
+    src_text = [
+        """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
+    ]

-    >>> model_name = 'google/pegasus-xsum'
-    >>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    >>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
-    >>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
-    >>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
-    >>> translated = model.generate(**batch)
-    >>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
-    >>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+    model_name = 'google/pegasus-xsum'
+    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tokenizer = PegasusTokenizer.from_pretrained(model_name)
+    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
+    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
+    translated = model.generate(**batch)
+    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+    assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."



@@ -107,7 +107,7 @@ PegasusTokenizer
 warning: ``add_tokens`` does not work at the moment.

 .. autoclass:: transformers.PegasusTokenizer
-    :members:
+    :members: __call__, prepare_seq2seq_batch


 PegasusTokenizerFast
--- a/docs/source/model_doc/rag.rst
+++ b/docs/source/model_doc/rag.rst
@@ -56,7 +56,7 @@ RagTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagTokenizer
-    :members:
+    :members: prepare_seq2seq_batch


 Rag specific outputs
@@ -94,24 +94,3 @@ RagTokenForGeneration

 .. autoclass:: transformers.RagTokenForGeneration
    :members: forward, generate
-
-
-TFRagModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagModel
-    :members: call
-
-
-TFRagSequenceForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagSequenceForGeneration
-    :members: call, generate
-
-
-TFRagTokenForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagTokenForGeneration
-    :members: call, generate
--- a/docs/source/model_doc/speech_to_text.rst
+++ b/docs/source/model_doc/speech_to_text.rst
@@ -1,152 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Speech2Text
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Speech2Text model was proposed in `fairseq S2T: Fast Speech-to-Text Modeling with fairseq
-<https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a
-transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
-Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
-fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
-transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
-`LibriSpeech <http://www.openslr.org/12>`__, `CoVoST 2 <https://github.com/facebookresearch/covost>`__, `MuST-C
-<https://ict.fbk.eu/must-c/>`__.
-
-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text>`__.
-
-
-Inference
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
-signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
-:obj:`generate()` method can be used for inference.
-
-The :class:`~transformers.Speech2TextFeatureExtractor` class is responsible for extracting the log-mel filter-bank
-features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transformers.Speech2TextFeatureExtractor` and
-:class:`~transformers.Speech2TextTokenizer` into a single instance to both extract the input features and decode the
-predicted token ids.
-
-The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
-install those packages before running the examples. You could either install those as extra speech dependancies with
-``pip install transformers"[speech, sentencepiece]"`` or install the packages seperatly with ``pip install torchaudio
-sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
-<http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
-be installed as follows: ``apt install libsndfile1-dev``
-
-
- ASR and Speech Translation
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
-        >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
-
-        >>> transcription = processor.batch_decode(generated_ids)
-
-
- Multilingual speech translation
-
-    For multilingual speech translation models, :obj:`eos_token_id` is used as the :obj:`decoder_start_token_id` and
-    the target language id is forced as the first generated token. To force the target language id as the first
-    generated token, pass the :obj:`forced_bos_token_id` parameter to the :obj:`generate()` method. The following
-    example shows how to transate English speech to French text using the `facebook/s2t-medium-mustc-multilingual-st`
-    checkpoint.
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-        >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
-
-        >>> translation = processor.batch_decode(generated_ids)
-
-
-See the `model hub <https://huggingface.co/models?filter=speech_to_text>`__ to look for Speech2Text checkpoints.
-
-
-Speech2TextConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextConfig
-    :members:
-
-
-Speech2TextTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-Speech2TextFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextFeatureExtractor
-    :members: __call__
-
-
-Speech2TextProcessor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextProcessor
-    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
-Speech2TextModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextModel
-    :members: forward
-
-
-Speech2TextForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextForConditionalGeneration
-    :members: forward
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -104,7 +104,7 @@ T5Tokenizer

 .. autoclass:: transformers.T5Tokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


 T5TokenizerFast
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -34,7 +34,7 @@ Tips:

 - Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
 - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
-  using :class:`~transformers.Wav2Vec2CTCTokenizer`.
+  using :class:`~transformers.Wav2Vec2Tokenizer`.


 Wav2Vec2Config
@@ -44,27 +44,13 @@ Wav2Vec2Config
    :members:


-Wav2Vec2CTCTokenizer
+Wav2Vec2Tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.Wav2Vec2CTCTokenizer
+.. autoclass:: transformers.Wav2Vec2Tokenizer
    :members: __call__, save_vocabulary


-Wav2Vec2FeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2FeatureExtractor
-    :members: __call__
-
-
-Wav2Vec2Processor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2Processor
-    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
 Wav2Vec2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -72,8 +58,8 @@ Wav2Vec2Model
    :members: forward


-Wav2Vec2ForCTC
+Wav2Vec2ForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.Wav2Vec2ForCTC
+.. autoclass:: transformers.Wav2Vec2ForMaskedLM
    :members: forward
--- a/docs/source/model_doc/xlsr_wav2vec2.rst
+++ b/docs/source/model_doc/xlsr_wav2vec2.rst
@@ -1,45 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-XLSR-Wav2Vec2
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The XLSR-Wav2Vec2 model was proposed in `Unsupervised Cross-Lingual Representation Learning For Speech Recognition
-<https://arxiv.org/abs/2006.13979>`__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael
-Auli.
-
-The abstract from the paper is the following:
-
-*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw
-waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over
-masked latent speech representations and jointly learns a quantization of the latents shared across languages. The
-resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly
-outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction
-of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to
-a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong
-individual models. Analysis shows that the latent discrete speech representations are shared across languages with
-increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing
-XLSR-53, a large model pretrained in 53 languages.*
-
-Tips:
-
- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be
-  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-
-XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to :doc:`Wav2Vec2's documentation page
-<wav2vec2>`.
-
-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec>`__.
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -365,12 +365,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
 |                    |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| M2M100             | ``facebook/m2m100_418M``                                   | | 24-layer, 1024-hidden, 16-heads, 418M parameters                                                                                    |
-|                    |                                                            | | multilingual machine translation model for 100 languages                                                                            |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/m2m100_1.2B``                                   | | 48-layer, 1024-hidden, 16-heads, 1.2B parameters                                                                                    |
-|                    |                                                            | | multilingual machine translation model for 100 languages                                                                            |
-+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | MarianMT           | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
 |                    |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -387,15 +381,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``facebook/mbart-large-en-ro``                             | | 24-layer, 1024-hidden, 16-heads, 610M parameters                                                                                    |
 |                    |                                                            | | mbart-large-cc25 model finetuned on WMT english romanian translation.                                                               |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/mbart-large-50``                                | | 24-layer, 1024-hidden, 16-heads,                                                                                                    |
-|                    |                                                            | | mBART model trained on 50 languages' monolingual corpus.                                                                            |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/mbart-large-50-one-to-many-mmt``                | | 24-layer, 1024-hidden, 16-heads,                                                                                                    |
-|                    |                                                            | | mbart-50-large model finetuned for one (English) to many multilingual machine translation covering 50 languages.                    |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/mbart-large-50-many-to-many-mmt``               | | 24-layer, 1024-hidden, 16-heads,                                                                                                    |
-|                    |                                                            | | mbart-50-large model finetuned for many to many multilingual machine translation covering 50 languages.                             |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Lxmert             | ``lxmert-base-uncased``                                    | | 9-language layers, 9-relationship layers, and 12-cross-modality layers                                                              |
 |                    |                                                            | | 768-hidden, 12-heads (for each layer) ~ 228M parameters                                                                             |
@@ -449,30 +434,15 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    |                                                            |                                                                                                                                       |
 |                    |                                                            | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__)                                                           |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| DeBERTa            | ``microsoft/deberta-base``                                 | | 12-layer, 768-hidden, 12-heads, ~140M parameters                                                                                    |
+| DeBERTa            | ``microsoft/deberta-base``                                 | | 12-layer, 768-hidden, 12-heads, ~125M parameters                                                                                    |
 |                    |                                                            | | DeBERTa using the BERT-base architecture                                                                                            |
 |                    |                                                            |                                                                                                                                       |
 |                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-large``                                | | 24-layer, 1024-hidden, 16-heads, ~400M parameters                                                                                   |
+|                    | ``microsoft/deberta-large``                                | | 24-layer, 1024-hidden, 16-heads, ~390M parameters                                                                                   |
 |                    |                                                            | | DeBERTa using the BERT-large architecture                                                                                           |
 |                    |                                                            |                                                                                                                                       |
 |                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-xlarge``                               | | 48-layer, 1024-hidden, 16-heads, ~750M parameters                                                                                   |
-|                    |                                                            | | DeBERTa XLarge with similar BERT architecture                                                                                       |
-|                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-xlarge-v2``                            | | 24-layer, 1536-hidden, 24-heads, ~900M parameters                                                                                   |
-|                    |                                                            | | DeBERTa XLarge V2 with similar BERT architecture                                                                                    |
-|                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-xxlarge-v2``                           | | 48-layer, 1536-hidden, 24-heads, ~1.5B parameters                                                                                   |
-|                    |                                                            | | DeBERTa XXLarge V2 with similar BERT architecture                                                                                   |
-|                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | SqueezeBERT        | ``squeezebert/squeezebert-uncased``                        | | 12-layer, 768-hidden, 12-heads, 51M parameters, 4.3x faster than bert-base-uncased on a smartphone.                                 |
 |                    |                                                            | | SqueezeBERT architecture pretrained from scratch on masked language model (MLM) and sentence order prediction (SOP) tasks.          |
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -54,11 +54,12 @@ Sequence Classification

 Sequence classification is the task of classifying sequences according to a given number of classes. An example of
 sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune a
-model on a GLUE sequence classification task, you may leverage the :prefix_link:`run_glue.py
-<examples/text-classification/run_glue.py>`, :prefix_link:`run_tf_glue.py
-<examples/text-classification/run_tf_glue.py>`, :prefix_link:`run_tf_text_classification.py
-<examples/text-classification/run_tf_text_classification.py>` or :prefix_link:`run_xnli.py
-<examples/text-classification/run_xnli.py>` scripts.
+model on a GLUE sequence classification task, you may leverage the `run_glue.py
+<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`__ and
+`run_pl_glue.py
+<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_pl_glue.py>`__ or
+`run_tf_glue.py
+<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`__ scripts.

 Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative. It
 leverages a fine-tuned model on sst2, which is a GLUE task.
@@ -167,8 +168,9 @@ Extractive Question Answering

 Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
 question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune a
-model on a SQuAD task, you may leverage the `run_qa.py
-<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_qa.py>`__ and `run_tf_squad.py
+model on a SQuAD task, you may leverage the `run_squad.py
+<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_squad.py>`__ and
+`run_tf_squad.py
 <https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_tf_squad.py>`__ scripts.


@@ -240,6 +242,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    ...     input_ids = inputs["input_ids"].tolist()[0]
    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    ...     outputs = model(**inputs)
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
@@ -283,6 +286,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
    ...     input_ids = inputs["input_ids"].numpy()[0]
    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    ...     outputs = model(inputs)
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
@@ -324,9 +328,7 @@ Masked language modeling is the task of masking tokens in a sequence with a mask
 fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
 right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis for
 downstream tasks requiring bi-directional context, such as SQuAD (question answering, see `Lewis, Lui, Goyal et al.
-<https://arxiv.org/abs/1910.13461>`__, part 4.2). If you would like to fine-tune a model on a masked language modeling
-task, you may leverage the `run_mlm.py
-<https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_mlm.py>`__ script.
+<https://arxiv.org/abs/1910.13461>`__, part 4.2).

 Here is an example of using pipelines to replace a mask from a sequence:

@@ -434,8 +436,7 @@ Causal Language Modeling

 Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
 model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
-for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the
-`run_clm.py <https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_clm.py>`__ script.
+for generation tasks.

 Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the
 input sequence.
@@ -603,7 +604,11 @@ Named Entity Recognition (NER) is the task of classifying tokens according to a
 as a person, an organisation or a location. An example of a named entity recognition dataset is the CoNLL-2003 dataset,
 which is entirely based on that task. If you would like to fine-tune a model on an NER task, you may leverage the
 `run_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_ner.py>`__
-script.
+(PyTorch), `run_pl_ner.py
+<https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_pl_ner.py>`__ (leveraging
+pytorch-lightning) or the `run_tf_ner.py
+<https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_tf_ner.py>`__ (TensorFlow)
+scripts.

 Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as
 belonging to one of 9 classes:
@@ -741,9 +746,7 @@ token. The following array should be the output:
 Summarization
 -----------------------------------------------------------------------------------------------------------------------

-Summarization is the task of summarizing a document or an article into a shorter text. If you would like to fine-tune a
-model on a summarization task, you may leverage the `run_summarization.py
-<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_summarization.py>`__ script.
+Summarization is the task of summarizing a document or an article into a shorter text.

 An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was
 created for the task of summarization. If you would like to fine-tune a model on a summarization task, various
@@ -821,9 +824,7 @@ CNN / Daily Mail), it yields very good results.
 Translation
 -----------------------------------------------------------------------------------------------------------------------

-Translation is the task of translating a text from one language to another. If you would like to fine-tune a model on a
-translation task, you may leverage the `run_translation.py
-<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_translation.py>`__ script.
+Translation is the task of translating a text from one language to another.

 An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input
 data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,5 +1,6 @@
 <!---
 Copyright 2020 The HuggingFace Team. All rights reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -33,43 +34,10 @@ Then cd in the example folder of your choice and run
 pip install -r requirements.txt
 ```

-To browse the examples corresponding to released versions of 🤗 Transformers, click on the line below and then on your desired version of the library:
-
-<details>
-  <summary>Examples for older versions of 🤗 Transformers</summary>
-
-  - [v4.3.3](https://github.com/huggingface/transformers/tree/v4.3.3/examples)
-  - [v4.2.2](https://github.com/huggingface/transformers/tree/v4.2.2/examples)
-  - [v4.1.1](https://github.com/huggingface/transformers/tree/v4.1.1/examples)
-  - [v4.0.1](https://github.com/huggingface/transformers/tree/v4.0.1/examples)
-  - [v3.5.1](https://github.com/huggingface/transformers/tree/v3.5.1/examples)
-  - [v3.4.0](https://github.com/huggingface/transformers/tree/v3.4.0/examples)
-  - [v3.3.1](https://github.com/huggingface/transformers/tree/v3.3.1/examples)
-  - [v3.2.0](https://github.com/huggingface/transformers/tree/v3.2.0/examples)
-  - [v3.1.0](https://github.com/huggingface/transformers/tree/v3.1.0/examples)
-  - [v3.0.2](https://github.com/huggingface/transformers/tree/v3.0.2/examples)
-  - [v2.11.0](https://github.com/huggingface/transformers/tree/v2.11.0/examples)
-  - [v2.10.0](https://github.com/huggingface/transformers/tree/v2.10.0/examples)
-  - [v2.9.1](https://github.com/huggingface/transformers/tree/v2.9.1/examples)
-  - [v2.8.0](https://github.com/huggingface/transformers/tree/v2.8.0/examples)
-  - [v2.7.0](https://github.com/huggingface/transformers/tree/v2.7.0/examples)
-  - [v2.6.0](https://github.com/huggingface/transformers/tree/v2.6.0/examples)
-  - [v2.5.1](https://github.com/huggingface/transformers/tree/v2.5.1/examples)
-  - [v2.4.0](https://github.com/huggingface/transformers/tree/v2.4.0/examples)
-  - [v2.3.0](https://github.com/huggingface/transformers/tree/v2.3.0/examples)
-  - [v2.2.0](https://github.com/huggingface/transformers/tree/v2.2.0/examples)
-  - [v2.1.1](https://github.com/huggingface/transformers/tree/v2.1.0/examples)
-  - [v2.0.0](https://github.com/huggingface/transformers/tree/v2.0.0/examples)
-  - [v1.2.0](https://github.com/huggingface/transformers/tree/v1.2.0/examples)
-  - [v1.1.0](https://github.com/huggingface/transformers/tree/v1.1.0/examples)
-  - [v1.0.0](https://github.com/huggingface/transformers/tree/v1.0.0/examples)
-</details>
-
-Alternatively, you can find switch your cloned 🤗 Transformers to a specific version (for instance with v3.5.1) with
+Alternatively, you can run the version of the examples as they were for your current version of Transformers via (for instance with v3.5.1):
 ```bash
 git checkout tags/v3.5.1
 ```
-and run the example command as usual afterward.

 ## The Big Table of Tasks

@@ -95,6 +63,12 @@ Coming soon!
 | [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                       | WMT             | ✅  | - | - | -


+<!--
+## One-click Deploy to Cloud (wip)
+
+**Coming soon!**
+-->
+
 ## Distributed training and mixed precision

 All the PyTorch scripts mentioned above work out of the box with distributed training and mixed precision, thanks to
@@ -173,7 +147,7 @@ python xla_spawn.py --num_cores 8 \
 You can easily log and monitor your runs code. The following are currently supported:

 * [TensorBoard](https://www.tensorflow.org/tensorboard)
-* [Weights & Biases](https://docs.wandb.ai/integrations/huggingface)
+* [Weights & Biases](https://docs.wandb.com/library/integrations/huggingface)
 * [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/)

 ### Weights & Biases
@@ -197,46 +171,9 @@ import wandb
 wandb.login()
 ```

-To enable logging to W&B, include `"wandb"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to all` if you have `wandb` installed.
-
 Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metrics, model topology and gradients (for `Trainer` only) will automatically be logged.

-Advanced configuration is possible by setting environment variables:
-
-<table>
-  <thead>
-    <tr>
-      <th style="text-align:left">Environment Variables</th>
-      <th style="text-align:left">Options</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td style="text-align:left">WANDB_LOG_MODEL</td>
-      <td style="text-align:left">Log the model as artifact at the end of training (<b>false</b> by default)</td>
-    </tr>
-    <tr>
-      <td style="text-align:left">WANDB_WATCH</td>
-      <td style="text-align:left">
-        <ul>
-          <li><b>gradients</b> (default): Log histograms of the gradients</li>
-          <li><b>all</b>: Log histograms of gradients and parameters</li>
-          <li><b>false</b>: No gradient or parameter logging</li>
-        </ul>
-      </td>
-    </tr>
-    <tr>
-      <td style="text-align:left">WANDB_PROJECT</td>
-      <td style="text-align:left">Organize runs by project</td>
-    </tr>
-  </tbody>
-</table>
-
-Set run names with `run_name` argument present in scripts or as part of `TrainingArguments`.
-
-Additional configuration options are available through generic [wandb environment variables](https://docs.wandb.com/library/environment-variables).
-
-Refer to related [documentation & examples](https://docs.wandb.ai/integrations/huggingface).
+When using 🤗 Transformers with PyTorch Lightning, runs can be tracked through `WandbLogger`. Refer to related [documentation & examples](https://docs.wandb.com/library/integrations/lightning).

 ### Comet.ml

--- a/examples/_tests_requirements.txt
+++ b/examples/_tests_requirements.txt
@@ -2,7 +2,7 @@ tensorboard
 scikit-learn
 seqeval
 psutil
-sacrebleu >= 1.4.12
+sacrebleu
 rouge-score
 tensorflow_datasets
 matplotlib
--- a/examples/benchmarking/run_benchmark.py
+++ b/examples/benchmarking/run_benchmark.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/benchmarking/run_benchmark_tf.py
+++ b/examples/benchmarking/run_benchmark_tf.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
 #
@@ -44,12 +43,8 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)


@@ -118,21 +113,6 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )
-
    block_size: Optional[int] = field(
        default=None,
        metadata={
@@ -365,7 +345,6 @@ def main():
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
@@ -373,26 +352,12 @@ def main():
        load_from_cache_file=not data_args.overwrite_cache,
    )

-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = lm_datasets["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = lm_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=lm_datasets["train"] if training_args.do_train else None,
+        eval_dataset=lm_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
@@ -409,30 +374,36 @@ def main():
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

-        metrics = train_result.metrics
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
+        eval_output = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-        perplexity = math.exp(metrics["eval_loss"])
-        metrics["perplexity"] = perplexity
+        perplexity = math.exp(eval_output["eval_loss"])
+        results["perplexity"] = perplexity

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_clm.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -44,12 +43,8 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)
 MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -150,20 +145,6 @@ class DataTrainingArguments:
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -321,22 +302,6 @@ def main():
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

-    if data_args.max_seq_length is None:
-        max_seq_length = tokenizer.model_max_length
-        if max_seq_length > 1024:
-            logger.warn(
-                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
-            )
-            max_seq_length = 1024
-    else:
-        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
-                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-            )
-        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False
@@ -348,7 +313,7 @@ def main():
                examples["text"],
                padding=padding,
                truncation=True,
-                max_length=max_seq_length,
+                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
@@ -376,6 +341,22 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+            if max_seq_length > 1024:
+                logger.warn(
+                    f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                    "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
+                )
+                max_seq_length = 1024
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
@@ -398,7 +379,6 @@ def main():
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
@@ -406,20 +386,6 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = tokenized_datasets["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = tokenized_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-
    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
@@ -428,8 +394,8 @@ def main():
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
+        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
@@ -444,30 +410,37 @@ def main():
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
-        metrics = train_result.metrics

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
+        eval_output = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-        perplexity = math.exp(metrics["eval_loss"])
-        metrics["perplexity"] = perplexity
+        perplexity = math.exp(eval_output["eval_loss"])
+        results["perplexity"] = perplexity

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/language-modeling/run_mlm_flax.py
+++ b/examples/language-modeling/run_mlm_flax.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
--- a/examples/language-modeling/run_plm.py
+++ b/examples/language-modeling/run_plm.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -40,12 +39,8 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)


@@ -147,20 +142,6 @@ class DataTrainingArguments:
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -318,13 +299,6 @@ def main():
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

-    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-        )
-    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False
@@ -332,7 +306,7 @@ def main():
        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
@@ -354,6 +328,13 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

+        if data_args.max_seq_length > tokenizer.model_max_length:
+            logger.warn(
+                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+            )
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
@@ -376,7 +357,6 @@ def main():
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
@@ -384,20 +364,6 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = tokenized_datasets["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = tokenized_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-
    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
@@ -409,8 +375,8 @@ def main():
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
+        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
@@ -425,30 +391,37 @@ def main():
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
-        metrics = train_result.metrics

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
+        eval_output = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-        perplexity = math.exp(metrics["eval_loss"])
-        metrics["perplexity"] = perplexity
+        perplexity = math.exp(eval_output["eval_loss"])
+        results["perplexity"] = perplexity

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/legacy/run_camembert.py
+++ b/examples/legacy/run_camembert.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import torch

 from transformers import CamembertForMaskedLM, CamembertTokenizer
--- a/examples/legacy/run_chinese_ref.py
+++ b/examples/legacy/run_chinese_ref.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import argparse
 import json
 from typing import List
--- a/examples/legacy/run_language_modeling.py
+++ b/examples/legacy/run_language_modeling.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/run_openai_gpt.py
+++ b/examples/legacy/run_openai_gpt.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/run_swag.py
+++ b/examples/legacy/run_swag.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/run_transfo_xl.py
+++ b/examples/legacy/run_transfo_xl.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/seq2seq/README.md
+++ b/examples/legacy/seq2seq/README.md
@@ -1,334 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# Sequence-to-Sequence Training and Evaluation
-
-This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
-For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github.com/huggingface/transformers/blob/master/examples/research_projects/bertabs/README.md).
-
-### Supported Architectures
-
- `BartForConditionalGeneration`
- `MarianMTModel`
- `PegasusForConditionalGeneration`
- `MBartForConditionalGeneration`
- `FSMTForConditionalGeneration`
- `T5ForConditionalGeneration`
-
-### Downlowd the Datasets
-
-#### XSUM
-
-```bash
-cd examples/legacy/seq2seq
-wget https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz
-tar -xzvf xsum.tar.gz
-export XSUM_DIR=${PWD}/xsum
-```
-this should make a directory called `xsum/` with files like `test.source`.
-To use your own data, copy that files format. Each article to be summarized is on its own line.
-
-#### CNN/DailyMail
-
-```bash
-cd examples/legacy/seq2seq
-wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz
-tar -xzvf cnn_dm_v2.tgz  # empty lines removed
-mv cnn_cln cnn_dm
-export CNN_DIR=${PWD}/cnn_dm
-```
-this should make a directory called `cnn_dm/` with 6 files.
-
-#### WMT16 English-Romanian Translation Data
-
-download with this command:
-```bash
-wget https://cdn-datasets.huggingface.co/translation/wmt_en_ro.tar.gz
-tar -xzvf wmt_en_ro.tar.gz
-export ENRO_DIR=${PWD}/wmt_en_ro
-```
-this should make a directory called `wmt_en_ro/` with 6 files.
-
-#### WMT English-German
-
-```bash
-wget https://cdn-datasets.huggingface.co/translation/wmt_en_de.tgz
-tar -xzvf wmt_en_de.tgz
-export DATA_DIR=${PWD}/wmt_en_de
-```
-
-#### FSMT datasets (wmt)
-
-Refer to the scripts starting with `eval_` under:
-https://github.com/huggingface/transformers/tree/master/scripts/fsmt
-
-#### Pegasus (multiple datasets)
-
-Multiple eval datasets are available for download from:
-https://github.com/stas00/porting/tree/master/datasets/pegasus
-
-
-#### Your Data
-
-If you are using your own data, it must be formatted as one directory with 6 files:
-```
-train.source
-train.target
-val.source
-val.target
-test.source
-test.target
-```
-The `.source` files are the input, the `.target` files are the desired output.
-
-### Potential issues
-
- native AMP (`--fp16` and no apex) may lead to a huge memory leak and require 10x gpu memory. This has been fixed in pytorch-nightly and the minimal official version to have this fix will be pytorch-1.7.1. Until then if you have to use mixed precision please use AMP only with pytorch-nightly or NVIDIA's apex. Reference: https://github.com/huggingface/transformers/issues/8403
-
-
-### Tips and Tricks
-
-General Tips:
- since you need to run from `examples/legacy/seq2seq`, and likely need to modify code, the easiest workflow is fork transformers, clone your fork, and run `pip install -e .` before you get started.
- try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size.  (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
- `fp16_opt_level=O1` (the default works best).
- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
-Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr)`.
- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
- This warning can be safely ignored:
-    > "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
- Both finetuning and eval are 30% faster with `--fp16`. For that you need to [install apex](https://github.com/NVIDIA/apex#quick-start).
- Read scripts before you run them!
-
-Summarization Tips:
- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
- For CNN/DailyMail, the default `val_max_target_length` and `test_max_target_length` will truncate the ground truth labels, resulting in slightly higher rouge scores. To get accurate rouge scores, you should rerun calculate_rouge on the `{output_dir}/test_generations.txt` file saved by `trainer.test()`
- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
-(It rarely makes sense to start from `bart-large` unless you are a researching finetuning methods).
-
-**Update 2018-07-18**
-Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
-Future work/help wanted: A new dataset to support multilingual tasks.
-
-
-### Fine-tuning using Seq2SeqTrainer
-To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer`-related `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.
-
-With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set.
-
-To see all the possible command line options, run:
-
-```bash
-python finetune_trainer.py --help
-```
-
-For multi-gpu training use `torch.distributed.launch`, e.g. with 2 gpus:
-```bash
-python -m torch.distributed.launch --nproc_per_node=2  finetune_trainer.py ...
-```
-
-**At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
-
-All `Seq2SeqTrainer`-based fine-tuning scripts are included in the `builtin_trainer` directory.
-
-#### TPU Training
-`Seq2SeqTrainer` supports TPU training with few caveats
-1. As `generate` method does not work on TPU at the moment, `predict_with_generate` cannot be used. You should use `--prediction_loss_only` to only calculate loss, and do not set `--do_predict` and `--predict_with_generate`.
-2. All sequences should be padded to be of equal length to avoid extremely slow training. (`finetune_trainer.py` does this automatically when running on TPU.)
-
-We provide a very simple launcher script named `xla_spawn.py` that lets you run our example scripts on multiple TPU cores without any boilerplate. Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for `torch.distributed`).
-
-`builtin_trainer/finetune_tpu.sh` script provides minimal arguments needed for TPU training.
-
-The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on TPU V3-8 and should complete one epoch in ~5-6 mins.
-
-```bash
-./builtin_trainer/train_distil_marian_enro_tpu.sh
-```
-
-## Evaluation Commands
-
-To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.
-If 'translation' is in your task name, the computed metric will be BLEU. Otherwise, ROUGE will be used.
-
-For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
-```bash
-export DATA_DIR=wmt_en_ro
-./run_eval.py t5-base \
-    $DATA_DIR/val.source t5_val_generations.txt \
-    --reference_path $DATA_DIR/val.target \
-    --score_path enro_bleu.json \
-    --task translation_en_to_ro \
-    --n_obs 100 \
-    --device cuda \
-    --fp16 \
-    --bs 32
-```
-
-This command works for MBART, although the BLEU score is suspiciously low.
-```bash
-export DATA_DIR=wmt_en_ro
-./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
-    --reference_path $DATA_DIR/val.target \
-    --score_path enro_bleu.json \
-    --task translation \
-    --n_obs 100 \
-    --device cuda \
-    --fp16 \
-    --bs 32
-```
-
-Summarization (xsum will be very similar):
-```bash
-export DATA_DIR=cnn_dm
-./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
-    --reference_path $DATA_DIR/val.target \
-    --score_path cnn_rouge.json \
-    --task summarization \
-    --n_obs 100 \
-
-th 56 \
-    --fp16 \
-    --bs 32
-```
-
-### Multi-GPU Evaluation
-here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases
-because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
-`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8  run_distributed_eval.py \
-    --model_name sshleifer/distilbart-large-xsum-12-3  \
-    --save_dir xsum_generations \
-    --data_dir xsum \
-    --fp16  # you can pass generate kwargs like num_beams here, just like run_eval.py
-```
-
-Contributions that implement this command for other distributed hardware setups are welcome!
-
-#### Single-GPU Eval: Tips and Tricks
-
-When using `run_eval.py`, the following features can be useful:
-
-* if you running the script multiple times and want to make it easier to track what arguments produced that output, use `--dump-args`. Along with the results it will also dump any custom params that were passed to the script. For example if you used: `--num_beams 8 --early_stopping true`, the output will be:
-   ```
-   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True}
-   ```
-
-   `--info` is an additional argument available for the same purpose of tracking the conditions of the experiment. It's useful to pass things that weren't in the argument list, e.g. a language pair `--info "lang:en-ru"`. But also if you pass `--info` without a value it will fallback to the current date/time string, e.g. `2020-09-13 18:44:43`.
-
-   If using `--dump-args --info`, the output will be:
-
-   ```
-   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': '2020-09-13 18:44:43'}
-   ```
-
-   If using `--dump-args --info "pair:en-ru chkpt=best`, the output will be:
-
-   ```
-   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': 'pair=en-ru chkpt=best'}
-   ```
-
-
-* if you need to perform a parametric search in order to find the best ones that lead to the highest BLEU score, let `run_eval_search.py` to do the searching for you.
-
-   The script accepts the exact same arguments as `run_eval.py`, plus an additional argument `--search`. The value of `--search` is parsed, reformatted and fed to ``run_eval.py`` as additional args.
-
-   The format for the `--search` value is a simple string with hparams and colon separated values to try, e.g.:
-   ```
-    --search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
-   ```
-   which will generate `12` `(2*3*2)` searches for a product of each hparam. For example the example that was just used will invoke `run_eval.py` repeatedly with:
-
-   ```
-    --num_beams 5 --length_penalty 0.8 --early_stopping true
-    --num_beams 5 --length_penalty 0.8 --early_stopping false
-    [...]
-    --num_beams 10 --length_penalty 1.2 --early_stopping false
-   ```
-
-   On completion, this function prints a markdown table of the results sorted by the best BLEU score and the winning arguments.
-
-```
-bleu  | num_beams | length_penalty | early_stopping
----- | --------- | -------------- | --------------
-26.71 |         5 |            1.1 |              1
-26.66 |         5 |            0.9 |              1
-26.66 |         5 |            0.9 |              0
-26.41 |         5 |            1.1 |              0
-21.94 |         1 |            0.9 |              1
-21.94 |         1 |            0.9 |              0
-21.94 |         1 |            1.1 |              1
-21.94 |         1 |            1.1 |              0
-
-Best score args:
-stas/wmt19-en-ru data/en-ru/val.source data/en-ru/test_translations.txt --reference_path data/en-ru/val.target --score_path data/en-ru/test_bleu.json --bs 8 --task translation --num_beams 5 --length_penalty 1.1 --early_stopping True
-```
-
-If you pass `--info "some experiment-specific info"` it will get printed before the results table - this is useful for scripting and multiple runs, so one can tell the different sets of results from each other.
-
-
-### Contributing
- follow the standard contributing guidelines and code of conduct.
- add tests to `test_seq2seq_examples.py`
- To run only the seq2seq tests, you must be in the root of the repository and run:
-```bash
-pytest examples/seq2seq/
-```
-
-### Converting pytorch-lightning checkpoints
-pytorch lightning ``-do_predict`` often fails, after you are done training, the best way to evaluate your model is to convert it.
-
-This should be done for you, with a file called `{save_dir}/best_tfmr`.
-
-If that file doesn't exist but you have a lightning `.ckpt` file, you can run
-```bash
-python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT  randomly_initialized_hf_model_path save_dir/best_tfmr
-```
-Then either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections)
-
-
-# Experimental Features
-These features are harder to use and not always useful.
-
-###  Dynamic Batch Size for MT
-`finetune.py` has a command line arg `--max_tokens_per_batch` that allows batches to be dynamically sized.
-This feature can only be used:
- with fairseq installed
- on 1 GPU
- without sortish sampler
- after calling `./save_len_file.py $tok $data_dir`
-
-For example,
-```bash
-./save_len_file.py Helsinki-NLP/opus-mt-en-ro  wmt_en_ro
-./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
-```
-splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
-
-For comparison,
-```bash
-./dynamic_bs_example.sh --sortish_sampler --train_batch_size 48
-```
-uses 12,723 batches of length 48 and takes slightly more time 9.5 minutes.
-
-The feature is still experimental, because:
-+ we can make it much more robust if we have memory mapped/preprocessed datasets.
-+ The speedup over sortish sampler is not that large at the moment.
--- a/examples/legacy/seq2seq/requirements.txt
+++ b/examples/legacy/seq2seq/requirements.txt
@@ -1,20 +0,0 @@
-tensorboard
-scikit-learn
-seqeval
-psutil
-sacrebleu
-rouge-score
-tensorflow_datasets
-matplotlib
-git-python==1.0.3
-faiss-cpu
-streamlit
-elasticsearch
-nltk
-pandas
-datasets >= 1.1.3
-fire
-pytest
-conllu
-sentencepiece != 0.1.92
-protobuf
--- a/examples/legacy/token-classification/README.md
+++ b/examples/legacy/token-classification/README.md
@@ -129,71 +129,6 @@ On the test dataset the following results could be achieved:
 10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
 ```

-#### Run the Tensorflow 2 version
-
-To start training, just run:
-
-```bash
-python3 run_tf_ner.py --data_dir ./ \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_device_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict
-```
-
-Such as the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
-
-#### Evaluation
-
-Evaluation on development dataset outputs the following for our example:
-```bash
-           precision    recall  f1-score   support
-
- LOCderiv     0.7619    0.6154    0.6809        52
-  PERpart     0.8724    0.8997    0.8858      4057
-  OTHpart     0.9360    0.9466    0.9413       711
-  ORGpart     0.7015    0.6989    0.7002       269
-  LOCpart     0.7668    0.8488    0.8057       496
-      LOC     0.8745    0.9191    0.8963       235
- ORGderiv     0.7723    0.8571    0.8125        91
- OTHderiv     0.4800    0.6667    0.5581        18
-      OTH     0.5789    0.6875    0.6286        16
- PERderiv     0.5385    0.3889    0.4516        18
-      PER     0.5000    0.5000    0.5000         2
-      ORG     0.0000    0.0000    0.0000         3
-
-micro avg     0.8574    0.8862    0.8715      5968
-macro avg     0.8575    0.8862    0.8713      5968
-```
-
-On the test dataset the following results could be achieved:
-```bash
-           precision    recall  f1-score   support
-
-  PERpart     0.8847    0.8944    0.8896      9397
-  OTHpart     0.9376    0.9353    0.9365      1639
-  ORGpart     0.7307    0.7044    0.7173       697
-      LOC     0.9133    0.9394    0.9262       561
-  LOCpart     0.8058    0.8157    0.8107      1150
-      ORG     0.0000    0.0000    0.0000         8
- OTHderiv     0.5882    0.4762    0.5263        42
- PERderiv     0.6571    0.5227    0.5823        44
-      OTH     0.4906    0.6667    0.5652        39
- ORGderiv     0.7016    0.7791    0.7383       172
- LOCderiv     0.8256    0.6514    0.7282       109
-      PER     0.0000    0.0000    0.0000        11
-
-micro avg     0.8722    0.8774    0.8748     13869
-macro avg     0.8712    0.8774    0.8740     13869
-```
-
 ### Emerging and Rare Entities task: WNUT’17 (English NER) dataset

 Description of the WNUT’17 task from the [shared task website](http://noisy-text.github.io/2017/index.html):
--- a/examples/multiple-choice/run_swag.py
+++ b/examples/multiple-choice/run_swag.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
 #
@@ -39,15 +38,10 @@ from transformers import (
    default_data_collator,
    set_seed,
 )
-from transformers.file_utils import PaddingStrategy
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)


@@ -120,20 +114,6 @@ class DataTrainingArguments:
            "efficient on GPU but very bad for TPU."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )

    def __post_init__(self):
        if self.train_file is not None:
@@ -152,7 +132,7 @@ class DataCollatorForMultipleChoice:
    Args:
        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
            The tokenizer used for encoding the data.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

@@ -245,8 +225,6 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
@@ -307,22 +285,6 @@ def main():
    context_name = "sent1"
    question_header_name = "sent2"

-    if data_args.max_seq_length is None:
-        max_seq_length = tokenizer.model_max_length
-        if max_seq_length > 1024:
-            logger.warn(
-                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
-            )
-            max_seq_length = 1024
-    else:
-        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warn(
-                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-            )
-        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    # Preprocessing the datasets.
    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
@@ -340,37 +302,18 @@ def main():
            first_sentences,
            second_sentences,
            truncation=True,
-            max_length=max_seq_length,
+            max_length=data_args.max_seq_length,
            padding="max_length" if data_args.pad_to_max_length else False,
        )
        # Un-flatten
        return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

-    if training_args.do_train:
-        train_dataset = datasets["train"]
-        if "train" not in datasets:
-            raise ValueError("--do_train requires a train dataset")
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
-    if training_args.do_eval:
-        if "validation" not in datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+    tokenized_datasets = datasets.map(
+        preprocess_function,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        load_from_cache_file=not data_args.overwrite_cache,
+    )

    # Data collator
    data_collator = (
@@ -389,8 +332,8 @@ def main():
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
+        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
@@ -406,27 +349,34 @@ def main():
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
-        metrics = train_result.metrics

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        results = trainer.evaluate()

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_swag.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/multiple-choice/run_tf_multiple_choice.py
+++ b/examples/multiple-choice/run_tf_multiple_choice.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/question-answering/run_qa.py
+++ b/examples/question-answering/run_qa.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -41,13 +40,9 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version
 from utils_qa import postprocess_qa_predictions


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)


@@ -122,20 +117,6 @@ class DataTrainingArguments:
            "be faster on GPU but will be slower on TPU)."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
    )
@@ -219,8 +200,6 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
@@ -242,12 +221,9 @@ def main():
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
-            extension = data_args.train_file.split(".")[-1]
-
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
-            extension = data_args.validation_file.split(".")[-1]
-
+        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -300,13 +276,6 @@ def main():
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

-    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-        )
-    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
@@ -316,7 +285,7 @@ def main():
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
-            max_length=max_seq_length,
+            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
@@ -381,23 +350,13 @@ def main():
        return tokenized_examples

    if training_args.do_train:
-        if "train" not in datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
-        if data_args.max_train_samples is not None:
-            # We will select sample from whole data if agument is specified
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        # Create train feature from dataset
-        train_dataset = train_dataset.map(
+        train_dataset = datasets["train"].map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
-        if data_args.max_train_samples is not None:
-            # Number of samples might increase during Feature Creation, We select only specified max samples
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
@@ -408,7 +367,7 @@ def main():
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
-            max_length=max_seq_length,
+            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
@@ -442,23 +401,13 @@ def main():
        return tokenized_examples

    if training_args.do_eval:
-        if "validation" not in datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            # We will select sample from whole data
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-        # Validation Feature Creation
-        eval_dataset = eval_dataset.map(
+        validation_dataset = datasets["validation"].map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
-        if data_args.max_val_samples is not None:
-            # During Feature creation dataset samples might increase, we will select required samples again
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
@@ -503,7 +452,7 @@ def main():
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        eval_dataset=validation_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
@@ -522,26 +471,32 @@ def main():
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

-        metrics = train_result.metrics
-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
-        metrics = trainer.evaluate()
+        results = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+    return results


 def _mp_fn(index):
--- a/examples/question-answering/run_qa_beam_search.py
+++ b/examples/question-answering/run_qa_beam_search.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -40,13 +39,9 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version
 from utils_qa import postprocess_qa_predictions_with_beam_search


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)


@@ -121,20 +116,6 @@ class DataTrainingArguments:
            "be faster on GPU but will be slower on TPU)."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
    )
@@ -218,8 +199,6 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
@@ -287,13 +266,6 @@ def main():
    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

-    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warn(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-        )
-    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
@@ -303,7 +275,7 @@ def main():
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
-            max_length=max_seq_length,
+            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
@@ -391,23 +363,13 @@ def main():
        return tokenized_examples

    if training_args.do_train:
-        if "train" not in datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
-        if data_args.max_train_samples is not None:
-            # Select samples from Dataset, This will help to decrease processing time
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        # Create Training Features
-        train_dataset = train_dataset.map(
+        train_dataset = datasets["train"].map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
-        if data_args.max_train_samples is not None:
-            # Select samples from dataset again since Feature Creation might increase number of features
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
@@ -418,7 +380,7 @@ def main():
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
-            max_length=max_seq_length,
+            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
@@ -476,23 +438,13 @@ def main():
        return tokenized_examples

    if training_args.do_eval:
-        if "validation" not in datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            # Selecting Eval Samples from Dataset
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-        # Create Features from Eval Dataset
-        eval_dataset = eval_dataset.map(
+        validation_dataset = datasets["validation"].map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
-        if data_args.max_val_samples is not None:
-            # Selecting Samples from Dataset again since Feature Creation might increase samples size
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
@@ -539,7 +491,7 @@ def main():
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        eval_dataset=validation_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
@@ -558,27 +510,32 @@ def main():
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

-        metrics = train_result.metrics
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
-        metrics = trainer.evaluate()
+        results = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+    return results


 def _mp_fn(index):
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/test_run_glue_with_pabee.py
@@ -4,7 +4,7 @@ import sys
 from unittest.mock import patch

 import run_glue_with_pabee
-from transformers.testing_utils import TestCasePlus
+from transformers.testing_utils import TestCasePlus, require_torch_non_multi_gpu_but_fix_me


 logging.basicConfig(level=logging.DEBUG)
@@ -20,6 +20,7 @@ def get_setup_file():


 class PabeeTests(TestCasePlus):
+    @require_torch_non_multi_gpu_but_fix_me
    def test_run_glue(self):
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)
--- a/examples/research_projects/bertology/run_prune_gpt.py
+++ b/examples/research_projects/bertology/run_prune_gpt.py
@@ -1,388 +1,388 @@
-#!/usr/bin/env python3
-""" This script is adapted from the Bertology pruning code (https://github.com/huggingface/transformers/blob/783d7d2629e97c5f0c5f9ef01b8c66410275c204/examples/research_projects/bertology/run_bertology.py)
-to prune GPT-like models. The author is @altsoph.
-"""
-
-import argparse
-import logging
-import os
-from datetime import datetime
-
-import numpy as np
-import torch
-from torch.utils.data import DataLoader, RandomSampler, TensorDataset
-from tqdm import tqdm
-
-from transformers import GPT2LMHeadModel
-
-
-logger = logging.getLogger(__name__)
-
-
-def save_model(model, dirpath):
-    # save results
-    if os.path.exists(dirpath):
-        if os.path.exists(os.path.join(dirpath, "config.json")) and os.path.isfile(
-            os.path.join(dirpath, "config.json")
-        ):
-            os.remove(os.path.join(dirpath, "config.json"))
-        if os.path.exists(os.path.join(dirpath, "pytorch_model.bin")) and os.path.isfile(
-            os.path.join(dirpath, "pytorch_model.bin")
-        ):
-            os.remove(os.path.join(dirpath, "pytorch_model.bin"))
-    else:
-        os.makedirs(dirpath)
-    model.save_pretrained(dirpath)
-
-
-def entropy(p, unlogit=False):
-    """ Compute the entropy of a probability distribution """
-    exponent = 2
-    if unlogit:
-        p = torch.pow(p, exponent)
-    plogp = p * torch.log(p)
-    plogp[p == 0] = 0
-    return -plogp.sum(dim=-1)
-
-
-def print_2d_tensor(tensor):
-    """ Print a 2D tensor """
-    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
-    for row in range(len(tensor)):
-        if tensor.dtype != torch.long:
-            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data))
-        else:
-            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
-
-
-def compute_heads_importance(
-    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
-):
-    """This method shows how to compute:
-    - head attention entropy
-    - head importance scores according to http://arxiv.org/abs/1905.10650
-    """
-    # Prepare our tensors
-    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
-    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
-    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
-
-    if head_mask is None:
-        head_mask = torch.ones(n_layers, n_heads).to(args.device)
-
-    head_mask.requires_grad_(requires_grad=True)
-    # If actually pruned attention multi-head, set head mask to None to avoid shape mismatch
-    if actually_pruned:
-        head_mask = None
-
-    tot_tokens = 0.0
-    total_loss = 0.0
-    for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        inputs = tuple(t.to(args.device) for t in inputs)
-        (input_ids,) = inputs
-
-        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        outputs = model(input_ids, labels=input_ids, head_mask=head_mask)
-        #  (loss), lm_logits, presents, (all hidden_states), (attentions)
-        loss, _, all_attentions = (
-            outputs[0],
-            outputs[1],
-            outputs[-1],
-        )  # Loss and logits are the first, attention the last
-        loss.backward()  # Backpropagate to populate the gradients in the head mask
-        total_loss += loss.detach().cpu().numpy()
-        if compute_entropy:
-            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach(), True)
-                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).sum(0).detach()
-
-        if compute_importance:
-            head_importance += head_mask.grad.abs().detach()
-        tot_tokens += torch.ones_like(input_ids).float().detach().sum().data
-
-    # Normalize
-    attn_entropy /= tot_tokens
-    head_importance /= tot_tokens
-    # Layerwise importance normalization
-    if not args.dont_normalize_importance_by_layer:
-        exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
-        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
-
-    if not args.dont_normalize_global_importance:
-        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
-
-    # Print matrices
-    if compute_entropy:
-        logger.info("Attention entropies")
-        print_2d_tensor(attn_entropy)
-    if compute_importance:
-        logger.info("Head importance scores")
-        print_2d_tensor(head_importance)
-    logger.info("Head ranked by importance scores")
-    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
-        head_importance.numel(), device=args.device
-    )
-    head_ranks = head_ranks.view_as(head_importance)
-    print_2d_tensor(head_ranks)
-    return attn_entropy, head_importance, total_loss
-
-
-def mask_heads(args, model, eval_dataloader):
-    """This method shows how to mask head (set some heads to zero), to test the effect on the network,
-    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
-    """
-    _, head_importance, loss = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
-    original_score = 1 / loss  # instead of downsteam score use the LM loss
-    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
-
-    new_head_mask = torch.ones_like(head_importance)
-    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
-
-    current_score = original_score
-    while current_score >= original_score * args.masking_threshold:
-        head_mask = new_head_mask.clone().detach()  # save current head mask
-        # heads from least important to most - keep only not-masked heads
-        head_importance[head_mask == 0.0] = float("Inf")
-        current_heads_to_mask = head_importance.view(-1).sort()[1]
-
-        if len(current_heads_to_mask) <= num_to_mask:
-            print("BREAK BY num_to_mask")
-            break
-
-        # mask heads
-        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
-        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
-        new_head_mask = new_head_mask.view(-1)
-        new_head_mask[current_heads_to_mask] = 0.0
-        new_head_mask = new_head_mask.view_as(head_mask)
-        new_head_mask = new_head_mask.clone().detach()
-        print_2d_tensor(new_head_mask)
-
-        # Compute metric and head importance again
-        _, head_importance, loss = compute_heads_importance(
-            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
-        )
-        current_score = 1 / loss
-        logger.info(
-            "Masking: current score: %f, remaining heads %d (%.1f percents)",
-            current_score,
-            new_head_mask.sum(),
-            new_head_mask.sum() / new_head_mask.numel() * 100,
-        )
-
-    logger.info("Final head mask")
-    print_2d_tensor(head_mask)
-    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
-
-    return head_mask
-
-
-def prune_heads(args, model, eval_dataloader, head_mask):
-    """This method shows how to prune head (remove heads weights) based on
-    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
-    """
-    # Try pruning and test time speedup
-    # Pruning is like masking but we actually remove the masked weights
-    before_time = datetime.now()
-    _, _, loss = compute_heads_importance(
-        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
-    )
-    score_masking = 1 / loss
-    original_time = datetime.now() - before_time
-
-    original_num_params = sum(p.numel() for p in model.parameters())
-    heads_to_prune = dict(
-        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
-    )
-
-    for k, v in heads_to_prune.items():
-        if isinstance(v, int):
-            heads_to_prune[k] = [
-                v,
-            ]
-
-    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
-    model.prune_heads(heads_to_prune)
-    pruned_num_params = sum(p.numel() for p in model.parameters())
-
-    before_time = datetime.now()
-    _, _, loss = compute_heads_importance(
-        args,
-        model,
-        eval_dataloader,
-        compute_entropy=False,
-        compute_importance=False,
-        head_mask=None,
-        actually_pruned=True,
-    )
-
-    score_pruning = 1 / loss
-    new_time = datetime.now() - before_time
-
-    logger.info(
-        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
-        original_num_params,
-        pruned_num_params,
-        pruned_num_params / original_num_params * 100,
-    )
-    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-    logger.info("Pruning: speed ratio (original timing / new timing): %f percents", original_time / new_time * 100)
-    save_model(model, args.output_dir)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default=None,
-        type=str,
-        required=True,
-        help="Path to pretrained model or model identifier from huggingface.co/models",
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Other parameters
-    parser.add_argument(
-        "--config_name",
-        default="",
-        type=str,
-        help="Pretrained config name or path if not the same as model_name_or_path",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        default="",
-        type=str,
-        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
-    )
-    parser.add_argument(
-        "--cache_dir",
-        default=None,
-        type=str,
-        help="Where do you want to store the pre-trained models downloaded from s3",
-    )
-    parser.add_argument(
-        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
-    )
-    parser.add_argument(
-        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
-    )
-    parser.add_argument(
-        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-    )
-
-    parser.add_argument(
-        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
-    )
-    parser.add_argument(
-        "--dont_normalize_global_importance",
-        action="store_true",
-        help="Don't normalize all importance scores between 0 and 1",
-    )
-
-    parser.add_argument(
-        "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
-    )
-    parser.add_argument(
-        "--masking_threshold",
-        default=0.9,
-        type=float,
-        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
-    )
-    parser.add_argument(
-        "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
-    )
-    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
-
-    parser.add_argument(
-        "--max_seq_length",
-        default=128,
-        type=int,
-        help="The maximum total input sequence length after WordPiece tokenization. \n"
-        "Sequences longer than this will be truncated, sequences shorter padded.",
-    )
-    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
-
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
-    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
-    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup devices and distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        args.device = torch.device("cuda", args.local_rank)
-        args.n_gpu = 1
-        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend
-
-    # Setup logging
-    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
-
-    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
-
-    # Distributed and parallel training
-    model.to(args.device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
-            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
-        )
-    elif args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    # Print/save training arguments
-    os.makedirs(args.output_dir, exist_ok=True)
-    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Prepare dataset
-    numpy_data = np.concatenate(
-        [
-            np.loadtxt(args.data_dir, dtype=np.int64),
-        ]
-    )
-    train_tensor_dataset = (torch.from_numpy(numpy_data),)
-    train_data = TensorDataset(*train_tensor_dataset)
-    train_sampler = RandomSampler(train_data)
-    eval_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)
-
-    # Compute head entropy and importance score
-    compute_heads_importance(args, model, eval_dataloader)
-
-    # Try head masking (set heads to zero until the score goes under a threshole)
-    # and head pruning (remove masked heads and see the effect on the network)
-    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
-        head_mask = mask_heads(args, model, eval_dataloader)
-        prune_heads(args, model, eval_dataloader, head_mask)
-
-
-if __name__ == "__main__":
-    main()
+#!/usr/bin/env python3
+""" This script is adapted from the Bertology pruning code (https://github.com/huggingface/transformers/blob/783d7d2629e97c5f0c5f9ef01b8c66410275c204/examples/research_projects/bertology/run_bertology.py)
+to prune GPT-like models. The author is @altsoph.
+"""
+
+import argparse
+import logging
+import os
+from datetime import datetime
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, TensorDataset
+from tqdm import tqdm
+
+from transformers import GPT2LMHeadModel
+
+
+logger = logging.getLogger(__name__)
+
+
+def save_model(model, dirpath):
+    # save results
+    if os.path.exists(dirpath):
+        if os.path.exists(os.path.join(dirpath, "config.json")) and os.path.isfile(
+            os.path.join(dirpath, "config.json")
+        ):
+            os.remove(os.path.join(dirpath, "config.json"))
+        if os.path.exists(os.path.join(dirpath, "pytorch_model.bin")) and os.path.isfile(
+            os.path.join(dirpath, "pytorch_model.bin")
+        ):
+            os.remove(os.path.join(dirpath, "pytorch_model.bin"))
+    else:
+        os.makedirs(dirpath)
+    model.save_pretrained(dirpath)
+
+
+def entropy(p, unlogit=False):
+    """ Compute the entropy of a probability distribution """
+    exponent = 2
+    if unlogit:
+        p = torch.pow(p, exponent)
+    plogp = p * torch.log(p)
+    plogp[p == 0] = 0
+    return -plogp.sum(dim=-1)
+
+
+def print_2d_tensor(tensor):
+    """ Print a 2D tensor """
+    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
+    for row in range(len(tensor)):
+        if tensor.dtype != torch.long:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data))
+        else:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
+
+
+def compute_heads_importance(
+    args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None, actually_pruned=False
+):
+    """This method shows how to compute:
+    - head attention entropy
+    - head importance scores according to http://arxiv.org/abs/1905.10650
+    """
+    # Prepare our tensors
+    n_layers, n_heads = model.config.num_hidden_layers, model.config.num_attention_heads
+    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
+    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
+
+    if head_mask is None:
+        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+
+    head_mask.requires_grad_(requires_grad=True)
+    # If actually pruned attention multi-head, set head mask to None to avoid shape mismatch
+    if actually_pruned:
+        head_mask = None
+
+    tot_tokens = 0.0
+    total_loss = 0.0
+    for step, inputs in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        inputs = tuple(t.to(args.device) for t in inputs)
+        (input_ids,) = inputs
+
+        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
+        outputs = model(input_ids, labels=input_ids, head_mask=head_mask)
+        #  (loss), lm_logits, presents, (all hidden_states), (attentions)
+        loss, _, all_attentions = (
+            outputs[0],
+            outputs[1],
+            outputs[-1],
+        )  # Loss and logits are the first, attention the last
+        loss.backward()  # Backpropagate to populate the gradients in the head mask
+        total_loss += loss.detach().cpu().numpy()
+        if compute_entropy:
+            for layer, attn in enumerate(all_attentions):
+                masked_entropy = entropy(attn.detach(), True)
+                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).sum(0).detach()
+
+        if compute_importance:
+            head_importance += head_mask.grad.abs().detach()
+        tot_tokens += torch.ones_like(input_ids).float().detach().sum().data
+
+    # Normalize
+    attn_entropy /= tot_tokens
+    head_importance /= tot_tokens
+    # Layerwise importance normalization
+    if not args.dont_normalize_importance_by_layer:
+        exponent = 2
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
+        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
+
+    if not args.dont_normalize_global_importance:
+        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
+
+    # Print matrices
+    if compute_entropy:
+        logger.info("Attention entropies")
+        print_2d_tensor(attn_entropy)
+    if compute_importance:
+        logger.info("Head importance scores")
+        print_2d_tensor(head_importance)
+    logger.info("Head ranked by importance scores")
+    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
+        head_importance.numel(), device=args.device
+    )
+    head_ranks = head_ranks.view_as(head_importance)
+    print_2d_tensor(head_ranks)
+    return attn_entropy, head_importance, total_loss
+
+
+def mask_heads(args, model, eval_dataloader):
+    """This method shows how to mask head (set some heads to zero), to test the effect on the network,
+    based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    _, head_importance, loss = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
+    original_score = 1 / loss  # instead of downsteam score use the LM loss
+    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
+
+    new_head_mask = torch.ones_like(head_importance)
+    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
+
+    current_score = original_score
+    while current_score >= original_score * args.masking_threshold:
+        head_mask = new_head_mask.clone().detach()  # save current head mask
+        # heads from least important to most - keep only not-masked heads
+        head_importance[head_mask == 0.0] = float("Inf")
+        current_heads_to_mask = head_importance.view(-1).sort()[1]
+
+        if len(current_heads_to_mask) <= num_to_mask:
+            print("BREAK BY num_to_mask")
+            break
+
+        # mask heads
+        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
+        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
+        new_head_mask = new_head_mask.view(-1)
+        new_head_mask[current_heads_to_mask] = 0.0
+        new_head_mask = new_head_mask.view_as(head_mask)
+        new_head_mask = new_head_mask.clone().detach()
+        print_2d_tensor(new_head_mask)
+
+        # Compute metric and head importance again
+        _, head_importance, loss = compute_heads_importance(
+            args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask
+        )
+        current_score = 1 / loss
+        logger.info(
+            "Masking: current score: %f, remaining heads %d (%.1f percents)",
+            current_score,
+            new_head_mask.sum(),
+            new_head_mask.sum() / new_head_mask.numel() * 100,
+        )
+
+    logger.info("Final head mask")
+    print_2d_tensor(head_mask)
+    np.save(os.path.join(args.output_dir, "head_mask.npy"), head_mask.detach().cpu().numpy())
+
+    return head_mask
+
+
+def prune_heads(args, model, eval_dataloader, head_mask):
+    """This method shows how to prune head (remove heads weights) based on
+    the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+    """
+    # Try pruning and test time speedup
+    # Pruning is like masking but we actually remove the masked weights
+    before_time = datetime.now()
+    _, _, loss = compute_heads_importance(
+        args, model, eval_dataloader, compute_entropy=False, compute_importance=False, head_mask=head_mask
+    )
+    score_masking = 1 / loss
+    original_time = datetime.now() - before_time
+
+    original_num_params = sum(p.numel() for p in model.parameters())
+    heads_to_prune = dict(
+        (layer, (1 - head_mask[layer].long()).nonzero().squeeze().tolist()) for layer in range(len(head_mask))
+    )
+
+    for k, v in heads_to_prune.items():
+        if isinstance(v, int):
+            heads_to_prune[k] = [
+                v,
+            ]
+
+    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
+    model.prune_heads(heads_to_prune)
+    pruned_num_params = sum(p.numel() for p in model.parameters())
+
+    before_time = datetime.now()
+    _, _, loss = compute_heads_importance(
+        args,
+        model,
+        eval_dataloader,
+        compute_entropy=False,
+        compute_importance=False,
+        head_mask=None,
+        actually_pruned=True,
+    )
+
+    score_pruning = 1 / loss
+    new_time = datetime.now() - before_time
+
+    logger.info(
+        "Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)",
+        original_num_params,
+        pruned_num_params,
+        pruned_num_params / original_num_params * 100,
+    )
+    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
+    logger.info("Pruning: speed ratio (original timing / new timing): %f percents", original_time / new_time * 100)
+    save_model(model, args.output_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--config_name",
+        default="",
+        type=str,
+        help="Pretrained config name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name_or_path",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default=None,
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from s3",
+    )
+    parser.add_argument(
+        "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances."
+    )
+    parser.add_argument(
+        "--overwrite_output_dir", action="store_true", help="Whether to overwrite data in output directory"
+    )
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+
+    parser.add_argument(
+        "--dont_normalize_importance_by_layer", action="store_true", help="Don't normalize importance score by layers"
+    )
+    parser.add_argument(
+        "--dont_normalize_global_importance",
+        action="store_true",
+        help="Don't normalize all importance scores between 0 and 1",
+    )
+
+    parser.add_argument(
+        "--try_masking", action="store_true", help="Whether to try to mask head until a threshold of accuracy."
+    )
+    parser.add_argument(
+        "--masking_threshold",
+        default=0.9,
+        type=float,
+        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
+    )
+    parser.add_argument(
+        "--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step."
+    )
+    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
+
+    parser.add_argument(
+        "--max_seq_length",
+        default=128,
+        type=int,
+        help="The maximum total input sequence length after WordPiece tokenization. \n"
+        "Sequences longer than this will be truncated, sequences shorter padded.",
+    )
+    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
+
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup devices and distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        args.device = torch.device("cuda", args.local_rank)
+        args.n_gpu = 1
+        torch.distributed.init_process_group(backend="nccl")  # Initializes the distributed backend
+
+    # Setup logging
+    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
+
+    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
+
+    # Distributed and parallel training
+    model.to(args.device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Print/save training arguments
+    os.makedirs(args.output_dir, exist_ok=True)
+    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Prepare dataset
+    numpy_data = np.concatenate(
+        [
+            np.loadtxt(args.data_dir, dtype=np.int64),
+        ]
+    )
+    train_tensor_dataset = (torch.from_numpy(numpy_data),)
+    train_data = TensorDataset(*train_tensor_dataset)
+    train_sampler = RandomSampler(train_data)
+    eval_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)
+
+    # Compute head entropy and importance score
+    compute_heads_importance(args, model, eval_dataloader)
+
+    # Try head masking (set heads to zero until the score goes under a threshole)
+    # and head pruning (remove masked heads and see the effect on the network)
+    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
+        head_mask = mask_heads(args, model, eval_dataloader)
+        prune_heads(args, model, eval_dataloader, head_mask)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/research_projects/deebert/test_glue_deebert.py
+++ b/examples/research_projects/deebert/test_glue_deebert.py
@@ -1,10 +1,11 @@
 import argparse
 import logging
 import sys
+import unittest
 from unittest.mock import patch

 import run_glue_deebert
-from transformers.testing_utils import TestCasePlus, get_gpu_count, require_torch_non_multi_gpu, slow
+from transformers.testing_utils import require_torch_non_multi_gpu_but_fix_me, slow


 logging.basicConfig(level=logging.DEBUG)
@@ -19,34 +20,17 @@ def get_setup_file():
    return args.f


-class DeeBertTests(TestCasePlus):
+class DeeBertTests(unittest.TestCase):
    def setup(self) -> None:
        stream_handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(stream_handler)

-    def run_and_check(self, args):
-        n_gpu = get_gpu_count()
-
-        if n_gpu > 1:
-            pass
-            # XXX: doesn't quite work with n_gpu > 1 https://github.com/huggingface/transformers/issues/10560
-            # script = f"{self.examples_dir_str}/research_projects/deebert/run_glue_deebert.py"
-            # distributed_args = f"-m torch.distributed.launch --nproc_per_node={n_gpu} {script}".split()
-            # cmd = [sys.executable] + distributed_args + args
-            # execute_subprocess_async(cmd, env=self.get_env())
-            # XXX: test the results - need to save them first into .json file
-        else:
-            args.insert(0, "run_glue_deebert.py")
-            with patch.object(sys, "argv", args):
-                result = run_glue_deebert.main()
-                for value in result.values():
-                    self.assertGreaterEqual(value, 0.666)
-
    @slow
-    @require_torch_non_multi_gpu
+    @require_torch_non_multi_gpu_but_fix_me
    def test_glue_deebert_train(self):

        train_args = """
+            run_glue_deebert.py
            --model_type roberta
            --model_name_or_path roberta-base
            --task_name MRPC
@@ -67,9 +51,13 @@ class DeeBertTests(TestCasePlus):
            --overwrite_cache
            --eval_after_first_stage
            """.split()
-        self.run_and_check(train_args)
+        with patch.object(sys, "argv", train_args):
+            result = run_glue_deebert.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.666)

        eval_args = """
+            run_glue_deebert.py
            --model_type roberta
            --model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
            --task_name MRPC
@@ -84,9 +72,13 @@ class DeeBertTests(TestCasePlus):
            --overwrite_cache
            --per_gpu_eval_batch_size=1
            """.split()
-        self.run_and_check(eval_args)
+        with patch.object(sys, "argv", eval_args):
+            result = run_glue_deebert.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.666)

        entropy_eval_args = """
+            run_glue_deebert.py
            --model_type roberta
            --model_name_or_path ./examples/deebert/saved_models/roberta-base/MRPC/two_stage
            --task_name MRPC
@@ -101,4 +93,7 @@ class DeeBertTests(TestCasePlus):
            --overwrite_cache
            --per_gpu_eval_batch_size=1
            """.split()
-        self.run_and_check(entropy_eval_args)
+        with patch.object(sys, "argv", entropy_eval_args):
+            result = run_glue_deebert.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.666)
--- a/examples/research_projects/rag/test_distributed_retriever.py
+++ b/examples/research_projects/rag/test_distributed_retriever.py
@@ -17,7 +17,7 @@ from transformers.integrations import is_ray_available
 from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES as DPR_VOCAB_FILES_NAMES
 from transformers.models.rag.retrieval_rag import CustomHFIndex, RagRetriever
 from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES as BART_VOCAB_FILES_NAMES
-from transformers.testing_utils import require_ray
+from transformers.testing_utils import require_ray, require_torch_non_multi_gpu_but_fix_me


 sys.path.append(os.path.join(os.getcwd()))  # noqa: E402 # noqa: E402 # isort:skip
@@ -265,6 +265,7 @@ class RagRetrieverTest(TestCase):
        self.assertEqual(doc_dicts[1]["id"][0], "0")  # max inner product is reached with first doc
        self.assertListEqual(doc_ids.tolist(), [[1], [0]])

+    @require_torch_non_multi_gpu_but_fix_me
    def test_pytorch_distributed_retriever_retrieve(self):
        n_docs = 1
        hidden_states = np.array(
@@ -275,6 +276,7 @@ class RagRetrieverTest(TestCase):
            self.get_dummy_pytorch_distributed_retriever(init_retrieval=True), hidden_states, n_docs
        )

+    @require_torch_non_multi_gpu_but_fix_me
    def test_custom_hf_index_pytorch_retriever_retrieve(self):
        n_docs = 1
        hidden_states = np.array(
@@ -287,6 +289,7 @@ class RagRetrieverTest(TestCase):
            n_docs,
        )

+    @require_torch_non_multi_gpu_but_fix_me
    def test_custom_pytorch_distributed_retriever_retrieve_from_disk(self):
        n_docs = 1
        hidden_states = np.array(
--- a/examples/research_projects/seq2seq-distillation/_test_make_student.py
+++ b/examples/research_projects/seq2seq-distillation/_test_make_student.py
@@ -4,7 +4,7 @@ import unittest
 from make_student import create_student_by_copying_alternating_layers
 from transformers import AutoConfig
 from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import require_torch, require_torch_non_multi_gpu_but_fix_me


 TINY_BART = "sshleifer/bart-tiny-random"
@@ -17,23 +17,28 @@ class MakeStudentTester(unittest.TestCase):
    def teacher_config(self):
        return AutoConfig.from_pretrained(TINY_BART)

+    @require_torch_non_multi_gpu_but_fix_me
    def test_valid_t5(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=1)
        self.assertEqual(student.config.num_hidden_layers, 1)

+    @require_torch_non_multi_gpu_but_fix_me
    def test_asymmetric_t5(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_T5, tempfile.mkdtemp(), e=1, d=None)

+    @require_torch_non_multi_gpu_but_fix_me
    def test_same_decoder_small_encoder(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=None)
        self.assertEqual(student.config.encoder_layers, 1)
        self.assertEqual(student.config.decoder_layers, self.teacher_config.encoder_layers)

+    @require_torch_non_multi_gpu_but_fix_me
    def test_small_enc_small_dec(self):
        student, *_ = create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=1, d=1)
        self.assertEqual(student.config.encoder_layers, 1)
        self.assertEqual(student.config.decoder_layers, 1)

+    @require_torch_non_multi_gpu_but_fix_me
    def test_raises_assert(self):
        with self.assertRaises(AssertionError):
            create_student_by_copying_alternating_layers(TINY_BART, tempfile.mkdtemp(), e=None, d=None)
--- a/examples/research_projects/wav2vec2/README.md
+++ b/examples/research_projects/wav2vec2/README.md
@@ -1,8 +0,0 @@
-## Fine-tuning Wav2Vec2
-
-The `run_training.py` script allows one to finetune pretrained Wav2Vec2 models that can be found [here](https://huggingface.co/models?search=facebook/wav2vec2).
-
-This finetuning script can also be run as a google colab [TODO: here]( ).
-
-The script is actively maintained by [Patrick von Platen](https://github.com/patrickvonplaten). 
-Feel free to ask a question on the [Forum](https://discuss.huggingface.co/) or post an issue on [GitHub](https://github.com/huggingface/transformers/issues/new/choose) and adding `@patrickvonplaten` as a tag.
--- a/examples/research_projects/wav2vec2/finetune_base_100.sh
+++ b/examples/research_projects/wav2vec2/finetune_base_100.sh
@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-python run_asr.py \
--output_dir="./wav2vec2-base-100h" \
--num_train_epochs="30" \
--per_device_train_batch_size="32" \
--per_device_eval_batch_size="32" \
--evaluation_strategy="steps" \
--save_total_limit="3" \
--save_steps="500" \
--eval_steps="100" \
--logging_steps="50" \
--learning_rate="5e-4" \
--warmup_steps="3000" \
--model_name_or_path="facebook/wav2vec2-base" \
--fp16 \
--dataset_name="librispeech_asr" \
--dataset_config_name="clean" \
--train_split_name="train.100" \
--preprocessing_num_workers="32" \
--group_by_length \
--freeze_feature_extractor
--- a/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh
+++ b/examples/research_projects/wav2vec2/finetune_large_lv60_100.sh
@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-python run_asr.py \
--output_dir="./wav2vec2-large-lv60-100h" \
--num_train_epochs="30" \
--per_device_train_batch_size="16" \
--per_device_eval_batch_size="16" \
--evaluation_strategy="steps" \
--save_total_limit="3" \
--save_steps="500" \
--eval_steps="100" \
--logging_steps="50" \
--learning_rate="5e-4" \
--warmup_steps="3000" \
--model_name_or_path="facebook/wav2vec2-large-lv60" \
--fp16 \
--dataset_name="librispeech_asr" \
--dataset_config_name="clean" \
--train_split_name="train.100" \
--preprocessing_num_workers="32" \
--group_by_length \
--freeze_feature_extractor
--- a/examples/research_projects/wav2vec2/requirements.txt
+++ b/examples/research_projects/wav2vec2/requirements.txt
@@ -1,4 +0,0 @@
-transformers
-datasets
-torch >= 1.5.0
-jiwer
--- a/examples/research_projects/wav2vec2/run_asr.py
+++ b/examples/research_projects/wav2vec2/run_asr.py
@@ -1,281 +0,0 @@
-#!/usr/bin/env python3
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union
-
-import datasets
-import numpy as np
-import torch
-import torch.nn as nn
-from packaging import version
-
-import soundfile as sf
-from transformers import (
-    HfArgumentParser,
-    Trainer,
-    TrainingArguments,
-    Wav2Vec2ForCTC,
-    Wav2Vec2Processor,
-    is_apex_available,
-)
-
-
-if is_apex_available():
-    from apex import amp
-
-
-if version.parse(torch.__version__) >= version.parse("1.6"):
-    _is_native_amp_available = True
-    from torch.cuda.amp import autocast
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-    freeze_feature_extractor: Optional[bool] = field(
-        default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-
-    Using `HfArgumentParser` we can turn this class
-    into argparse arguments to be able to specify them on
-    the command line.
-    """
-
-    dataset_name: str = field(
-        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-    )
-    dataset_config_name: Optional[str] = field(
-        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-    )
-    train_split_name: Optional[str] = field(
-        default="train",
-        metadata={
-            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
-    )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
-    )
-
-
-@dataclass
-class DataCollatorCTCWithPadding:
-    """
-    Data collator that will dynamically pad the inputs received.
-    Args:
-        processor (:class:`~transformers.Wav2Vec2Processor`)
-            The processor used for proccessing the data.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
-            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
-            among:
-            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-              sequence if provided).
-            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
-              maximum acceptable input length for the model if that argument is not provided.
-            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
-              different lengths).
-        max_length (:obj:`int`, `optional`):
-            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
-        max_length_labels (:obj:`int`, `optional`):
-            Maximum length of the ``labels`` returned list and optionally padding length (see above).
-        pad_to_multiple_of (:obj:`int`, `optional`):
-            If set will pad the sequence to a multiple of the provided value.
-            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
-            7.5 (Volta).
-    """
-
-    processor: Wav2Vec2Processor
-    padding: Union[bool, str] = True
-    max_length: Optional[int] = None
-    max_length_labels: Optional[int] = None
-    pad_to_multiple_of: Optional[int] = None
-    pad_to_multiple_of_labels: Optional[int] = None
-
-    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
-        # split inputs and labels since they have to be of different lenghts and need
-        # different padding methods
-        input_features = [{"input_values": feature["input_values"]} for feature in features]
-        label_features = [{"input_ids": feature["labels"]} for feature in features]
-
-        batch = self.processor.pad(
-            input_features,
-            padding=self.padding,
-            max_length=self.max_length,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors="pt",
-        )
-        with self.processor.as_target_processor():
-            labels_batch = self.processor.pad(
-                label_features,
-                padding=self.padding,
-                max_length=self.max_length_labels,
-                pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                return_tensors="pt",
-            )
-
-        # replace padding with -100 to ignore loss correctly
-        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
-
-        batch["labels"] = labels
-
-        return batch
-
-
-class CTCTrainer(Trainer):
-    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
-        """
-        Perform a training step on a batch of inputs.
-
-        Subclass and override to inject custom behavior.
-
-        Args:
-            model (:obj:`nn.Module`):
-                The model to train.
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
-                The inputs and targets of the model.
-
-                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
-                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
-
-        Return:
-            :obj:`torch.Tensor`: The tensor with training loss on this batch.
-        """
-
-        model.train()
-        inputs = self._prepare_inputs(inputs)
-
-        if self.use_amp:
-            with autocast():
-                loss = self.compute_loss(model, inputs)
-        else:
-            loss = self.compute_loss(model, inputs)
-
-        if self.args.n_gpu > 1:
-            if model.module.config.ctc_loss_reduction == "mean":
-                loss = loss.mean()
-            elif model.module.config.ctc_loss_reduction == "sum":
-                loss = loss.sum() / (inputs["labels"] >= 0).sum()
-            else:
-                raise ValueError(f"{model.config.ctc_loss_reduction} is not valid. Choose one of ['mean', 'sum']")
-
-        if self.args.gradient_accumulation_steps > 1:
-            loss = loss / self.args.gradient_accumulation_steps
-
-        if self.use_amp:
-            self.scaler.scale(loss).backward()
-        elif self.use_apex:
-            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
-                scaled_loss.backward()
-        elif self.deepspeed:
-            self.deepspeed.backward(loss)
-        else:
-            loss.backward()
-
-        return loss.detach()
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    model = Wav2Vec2ForCTC.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
-    processor = Wav2Vec2Processor.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
-
-    train_dataset = datasets.load_dataset(
-        data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
-    )
-    val_dataset = datasets.load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="validation")
-
-    wer_metric = datasets.load_metric("wer")
-
-    def map_to_array(batch):
-        speech_array, sampling_rate = sf.read(batch["file"])
-        batch["speech"] = speech_array
-        batch["sampling_rate"] = sampling_rate
-        return batch
-
-    train_dataset = train_dataset.map(map_to_array, remove_columns=["file"])
-    val_dataset = val_dataset.map(map_to_array, remove_columns=["file"])
-
-    def prepare_dataset(batch):
-        # check that all files have the correct sampling rate
-        assert (
-            len(set(batch["sampling_rate"])) == 1
-        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
-
-        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
-        with processor.as_target_processor():
-            batch["labels"] = processor(batch["text"]).input_ids
-        return batch
-
-    train_dataset = train_dataset.map(
-        prepare_dataset,
-        batch_size=training_args.per_device_train_batch_size,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-    )
-    val_dataset = val_dataset.map(
-        prepare_dataset,
-        batch_size=training_args.per_device_train_batch_size,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-    )
-
-    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
-
-    def compute_metrics(pred):
-        pred_logits = pred.predictions
-        pred_ids = np.argmax(pred_logits, axis=-1)
-
-        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
-
-        pred_str = processor.batch_decode(pred_ids)
-        # we do not want to group tokens when computing the metrics
-        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
-
-        wer = wer_metric.compute(predictions=pred_str, references=label_str)
-
-        return {"wer": wer}
-
-    if model_args.freeze_feature_extractor:
-        model.freeze_feature_extractor()
-
-    trainer = CTCTrainer(
-        model=model,
-        data_collator=data_collator,
-        args=training_args,
-        compute_metrics=compute_metrics,
-        train_dataset=train_dataset,
-        eval_dataset=val_dataset,
-        tokenizer=processor.feature_extractor,
-    )
-
-    trainer.train()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/research_projects/zero-shot-distillation/README.md
+++ b/examples/research_projects/zero-shot-distillation/README.md
@@ -1,155 +0,0 @@
-# Zero-shot classifier distillation
-
-Author: @joeddav 
-
-This script provides a way to improve the speed and memory performance of a zero-shot classifier by training a more
-efficient student model from the zero-shot teacher's predictions over an unlabeled dataset.
-
-The zero-shot classification pipeline uses a model pre-trained on natural language inference (NLI) to determine the
-compatibility of a set of candidate class names with a given sequence. This serves as a convenient out-of-the-box
-classifier without the need for labeled training data. However, for a given sequence, the method requires each
-possible label to be fed through the large NLI model separately. Thus for `N` sequences and `K` classes, a total of
-`N*K` forward passes through the model are required. This requirement slows inference considerably, particularly as
-`K` grows.
-
-Given (1) an unlabeled corpus and (2) a set of candidate class names, the provided script trains a student model
-with a standard classification head with `K` output dimensions. The resulting student model can then be used for
-classifying novel text instances with a significant boost in speed and memory performance while retaining similar
-classification performance to the original zero-shot model
-
-### Usage
-
-A teacher NLI model can be distilled to a more efficient student model by running `distill_classifier.py`:
-
-```
-python distill_classifier.py \
--data_file <unlabeled_data.txt> \
--class_names_file <class_names.txt> \
--output_dir <output_dir>
-```
-
-`<unlabeled_data.txt>` should be a text file with a single unlabeled example per line. `<class_names.txt>` is a text file with one class name per line.
-
-Other optional arguments include:
-
- `--teacher_name_or_path` (default: `roberta-large-mnli`): The name or path of the NLI teacher model.
- `--student_name_or_path` (default: `distillbert-base-uncased`): The name or path of the student model which will
-be fine-tuned to copy the teacher predictions.
- `--hypothesis_template` (default `"This example is {}."`): The template used to turn each label into an NLI-style
-hypothesis when generating teacher predictions. This template must include a `{}` or similar syntax for the
-candidate label to be inserted into the template. For example, the default template is `"This example is {}."` With
-the candidate label `sports`, this would be fed into the model like `[CLS] sequence to classify [SEP] This example
-is sports . [SEP]`.
- `--multi_class`: Whether or not multiple candidate labels can be true. By default, the scores are normalized such
-that the sum of the label likelihoods for each sequence is 1. If `--multi_class` is passed, the labels are
-considered independent and probabilities are normalized for each candidate by doing a softmax of the entailment
-score vs. the contradiction score. This is sometimes called "multi-class multi-label" classification.
- `--temperature` (default: `1.0`): The temperature applied to the softmax of the teacher model predictions. A
-higher temperature results in a student with smoother (lower confidence) predictions than the teacher while a value
-`<1` resultings in a higher-confidence, peaked distribution. The default `1.0` is equivalent to no smoothing.
- `--teacher_batch_size` (default: `32`): The batch size used for generating a single set of teacher predictions.
-Does not affect training. Use `--per_device_train_batch_size` to change the training batch size.
-
-Any of the arguments in the 🤗 Trainer's
-[`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html?#trainingarguments) can also be
-modified, such as `--learning_rate`, `--fp16`, `--no_cuda`, `--warmup_steps`, etc. Run `python distill_classifier.py
-h` for a full list of available arguments or consult the [Trainer
-documentation](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments).
-
-> **Note**: Distributed and TPU training are not currently supported. Single-node multi-GPU is supported, however,
-and will run automatically if multiple GPUs are available.
-
-### Example: Topic classification
-
-> A full colab demo notebook of this example can be found [here](https://colab.research.google.com/drive/1mjBjd0cR8G57ZpsnFCS3ngGyo5nCa9ya?usp=sharing).
-
-Let's say we're interested in classifying news articles into one of four topic categories: "the world", "sports",
-"business", or "science/tech". We have an unlabeled dataset, [AG's News](https://huggingface.co/datasets/ag_news),
-which corresponds to this problem (in reality AG's News is annotated, but we will pretend it is not for the sake of
-example).
-
-We can use an NLI model like `roberta-large-mnli` for zero-shot classification like so:
-
-```python
->>> class_names = ["the world", "sports", "business", "science/tech"]
->>> hypothesis_template = "This text is about {}."
->>> sequence = "A new moon has been discovered in Jupiter's orbit"
-
->>> zero_shot_classifier = pipeline("zero-shot-classification", model="roberta-large-mnli")
->>> zero_shot_classifier(sequence, class_names, hypothesis_template=hypothesis_template)
-{'sequence': "A new moon has been discovered in Jupiter's orbit",
- 'labels': ['science/tech', 'the world', 'business', 'sports'],
- 'scores': [0.7035840153694153, 0.18744826316833496, 0.06027870625257492, 0.04868902638554573]}
-```
-
-Unfortunately, inference is slow since each of our 4 class names must be fed through the large model for every
-sequence to be classified. But with our unlabeled data we can distill the model to a small distilbert classifier to
-make future inference much faster.
-
-To run the script, we will need to put each training example (text only) from AG's News on its own line in
-`agnews/train_unlabeled.txt`, and each of the four class names in the newline-separated `agnews/class_names.txt`.
-Then we can run distillation with the following command:
-
-```bash
-python distill_classifier.py \
--data_file ./agnews/unlabeled.txt \
--class_names_files ./agnews/class_names.txt \
--teacher_name_or_path roberta-large-mnli \
--hypothesis_template "This text is about {}." \
--output_dir ./agnews/distilled
-```
-
-The script will generate a set of soft zero-shot predictions from `roberta-large-mnli` for each example in
-`agnews/unlabeled.txt`. It will then train a student distilbert classifier on the teacher predictions and
-save the resulting model in `./agnews/distilled`.
-
-The resulting model can then be loaded and used like any other pre-trained classifier:
-
-```python
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
-model = AutoModelForSequenceClassification.from_pretrained("./agnews/distilled")
-tokenizer = AutoTokenizer.from_pretrained("./agnews/distilled")
-```
-
-and even used trivially with a `TextClassificationPipeline`:
-
-```python
->>> distilled_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
->>> distilled_classifier(sequence)
-[[{'label': 'the world', 'score': 0.14899294078350067},
-  {'label': 'sports', 'score': 0.03205857425928116},
-  {'label': 'business', 'score': 0.05943061783909798},
-  {'label': 'science/tech', 'score': 0.7595179080963135}]]
-```
-
-> Tip: pass `device=0` when constructing a pipeline to run on a GPU
-
-As we can see, the results of the student closely resemble that of the trainer despite never having seen this
-example during training. Now let's do a quick & dirty speed comparison simulating 16K examples with a batch size of
-16:
-
-```python
-for _ in range(1000):
-    zero_shot_classifier([sequence] * 16, class_names)
-# runs in 1m 23s on a single V100 GPU
-```
-
-```python
-%%time
-for _ in range(1000):
-    distilled_classifier([sequence] * 16)
-# runs in 10.3s on a single V100 GPU
-```
-
-As we can see, the distilled student model runs an order of magnitude faster than its teacher NLI model. This is
-also a seeting where we only have `K=4` possible labels. The higher the number of classes for a given task, the more
-drastic the speedup will be, since the zero-shot teacher's complexity scales linearly with the number of classes.
-
-Since we secretly have access to ground truth labels for AG's news, we can evaluate the accuracy of each model. The
-original zero-shot model `roberta-large-mnli` gets an accuracy of 69.3% on the held-out test set. After training a
-student on the unlabeled training set, the distilled model gets a similar score of 70.4%.
-
-Lastly, you can share the distilled model with the community and/or use it with our inference API by [uploading it
-to the 🤗 Hub](https://huggingface.co/transformers/model_sharing.html). We've uploaded the distilled model from this
-example at
-[joeddav/distilbert-base-uncased-agnews-student](https://huggingface.co/joeddav/distilbert-base-uncased-agnews-student).
--- a/examples/research_projects/zero-shot-distillation/distill_classifier.py
+++ b/examples/research_projects/zero-shot-distillation/distill_classifier.py
@@ -1,338 +0,0 @@
-import logging
-import os
-import sys
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-import torch
-from datasets import Dataset
-from torch import nn
-from tqdm.auto import tqdm
-
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    HfArgumentParser,
-    Trainer,
-    TrainingArguments,
-    set_seed,
-    utils,
-)
-from transformers.trainer_utils import get_last_checkpoint, is_main_process
-
-
-DESCRIPTION = """
-Distills an NLI-based zero-shot classifier to a smaller, more efficient model with a fixed set of candidate class
-names. Useful for speeding up zero-shot classification in cases where labeled training data is not available, but
-when only a single fixed set of classes is needed. Takes a teacher NLI model, student classifier model, unlabeled
-dataset, and set of K possible class names. Yields a single classifier with K outputs corresponding to the provided
-class names.
-"""
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class TeacherModelArguments:
-    teacher_name_or_path: Optional[str] = field(
-        default="roberta-large-mnli", metadata={"help": "The NLI/zero-shot teacher model to be distilled."}
-    )
-    hypothesis_template: Optional[str] = field(
-        default="This example is {}.",
-        metadata={
-            "help": (
-                "Template used to turn class names into mock hypotheses for teacher NLI model. Must include {{}}"
-                "where class name is inserted."
-            )
-        },
-    )
-    teacher_batch_size: Optional[int] = field(
-        default=32, metadata={"help": "Batch size for generating teacher predictions."}
-    )
-    multi_label: Optional[bool] = field(
-        default=False,
-        metadata={
-            "help": (
-                "Allow multiple classes to be true rather than forcing them to sum to 1 (sometimes called"
-                "multi-class multi-label classification)."
-            )
-        },
-    )
-    temperature: Optional[float] = field(
-        default=1.0, metadata={"help": "Temperature applied to teacher softmax for distillation."}
-    )
-
-
-@dataclass
-class StudentModelArguments:
-    student_name_or_path: Optional[str] = field(
-        default="distilbert-base-uncased", metadata={"help": "The NLI/zero-shot teacher model to be distilled."}
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    data_file: str = field(metadata={"help": "Text file with one unlabeled instance per line."})
-    class_names_file: str = field(metadata={"help": "Text file with one class name per line."})
-    use_fast_tokenizer: bool = field(
-        default=True,
-        metadata={"help": "Whether to use one of the fast tokenizer (backed by the Rust tokenizers library) or not."},
-    )
-
-
-@dataclass
-class DistillTrainingArguments(TrainingArguments):
-    output_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
-    )
-    per_device_train_batch_size: int = field(
-        default=32, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
-    )
-    per_device_eval_batch_size: int = field(
-        default=128, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
-    )
-    num_train_epochs: float = field(default=1.0, metadata={"help": "Total number of training epochs to perform."})
-    do_train: bool = field(default=True, metadata={"help": "Whether to run training of student model."})
-    do_eval: bool = field(
-        default=True,
-        metadata={
-            "help": (
-                "Whether to evaluate the agreement of the final student predictions and the teacher predictions"
-                "after training."
-            )
-        },
-    )
-    save_total_limit: Optional[int] = field(
-        default=0,
-        metadata={
-            "help": (
-                "Limit the total amount of checkpoints."
-                "Deletes the older checkpoints in the output_dir. Default is 0 (no checkpoints)."
-            )
-        },
-    )
-
-
-class DistillationTrainer(Trainer):
-    def compute_loss(self, model, inputs, return_outputs=False):
-        target_p = inputs["labels"]
-        outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])
-        logits = outputs[0]
-
-        loss = -torch.sum(target_p * logits.log_softmax(dim=-1), axis=-1).mean()
-
-        if return_outputs:
-            return loss, outputs
-
-        return loss
-
-
-def read_lines(path):
-    lines = []
-    with open(path, "r") as f:
-        for line in f:
-            line = line.strip()
-            if len(line) > 0:
-                lines.append(line)
-    return lines
-
-
-def get_premise_hypothesis_pairs(examples, class_names, hypothesis_template):
-    premises = []
-    hypotheses = []
-    for example in examples:
-        for name in class_names:
-            premises.append(example)
-            hypotheses.append(hypothesis_template.format(name))
-    return premises, hypotheses
-
-
-def get_entailment_id(config):
-    for label, ind in config.label2id.items():
-        if label.lower().startswith("entail"):
-            return ind
-    logging.warning("Could not identify entailment dimension from teacher config label2id. Setting to -1.")
-    return -1
-
-
-def get_teacher_predictions(
-    model_path: str,
-    examples: List[str],
-    class_names: List[str],
-    hypothesis_template: str,
-    batch_size: int,
-    temperature: float,
-    multi_label: bool,
-    use_fast_tokenizer: bool,
-    no_cuda: bool,
-    fp16: bool,
-):
-    """
-    Gets predictions by the same method as the zero-shot pipeline but with DataParallel & more efficient batching
-    """
-    model = AutoModelForSequenceClassification.from_pretrained(model_path)
-    model_config = model.config
-    if not no_cuda and torch.cuda.is_available():
-        model = nn.DataParallel(model.cuda())
-        batch_size *= len(model.device_ids)
-    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=use_fast_tokenizer)
-
-    premises, hypotheses = get_premise_hypothesis_pairs(examples, class_names, hypothesis_template)
-    logits = []
-
-    for i in tqdm(range(0, len(premises), batch_size)):
-        batch_premises = premises[i : i + batch_size]
-        batch_hypotheses = hypotheses[i : i + batch_size]
-
-        encodings = tokenizer(
-            batch_premises,
-            batch_hypotheses,
-            padding=True,
-            truncation="only_first",
-            return_tensors="pt",
-        )
-
-        with torch.cuda.amp.autocast(enabled=fp16):
-            with torch.no_grad():
-                outputs = model(**encodings)
-        logits.append(outputs.logits.detach().cpu().float())
-
-    entail_id = get_entailment_id(model_config)
-    contr_id = -1 if entail_id == 0 else 0
-    logits = torch.cat(logits, dim=0)  # N*K x 3
-    nli_logits = logits.reshape(len(examples), len(class_names), -1)[..., [contr_id, entail_id]]  # N x K x 2
-
-    if multi_label:
-        # softmax over (contr, entail) logits for each class independently
-        nli_prob = (nli_logits / temperature).softmax(-1)
-    else:
-        # softmax over entail logits across classes s.t. class probabilities sum to 1.
-        nli_prob = (nli_logits / temperature).softmax(1)
-
-    return nli_prob[..., 1]  # N x K
-
-
-def main():
-    parser = HfArgumentParser(
-        (DataTrainingArguments, TeacherModelArguments, StudentModelArguments, DistillTrainingArguments),
-        description=DESCRIPTION,
-    )
-
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        data_args, teacher_args, student_args, training_args = parser.parse_json_file(
-            json_file=os.path.abspath(sys.argv[1])
-        )
-    else:
-        data_args, teacher_args, student_args, training_args = parser.parse_args_into_dataclasses()
-
-    # Detecting last checkpoint.
-    last_checkpoint = None
-    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
-        last_checkpoint = get_last_checkpoint(training_args.output_dir)
-        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
-            raise ValueError(
-                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
-                "Use --overwrite_output_dir to overcome."
-            )
-        elif last_checkpoint is not None:
-            logger.info(
-                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-            )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        handlers=[logging.StreamHandler(sys.stdout)],
-    )
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
-
-    # Log on each process the small summary:
-    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
-    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
-        utils.logging.set_verbosity_info()
-        utils.logging.enable_default_handler()
-        utils.logging.enable_explicit_format()
-
-    if training_args.local_rank != -1:
-        raise ValueError("Distributed training is not currently supported.")
-    if training_args.tpu_num_cores is not None:
-        raise ValueError("TPU acceleration is not currently supported.")
-
-    logger.info(f"Training/evaluation parameters {training_args}")
-
-    # Set seed before initializing model.
-    set_seed(training_args.seed)
-
-    # 1. read in data
-    examples = read_lines(data_args.data_file)
-    class_names = read_lines(data_args.class_names_file)
-
-    # 2. get teacher predictions and load into dataset
-    logger.info("Generating predictions from zero-shot teacher model")
-    teacher_soft_preds = get_teacher_predictions(
-        teacher_args.teacher_name_or_path,
-        examples,
-        class_names,
-        teacher_args.hypothesis_template,
-        teacher_args.teacher_batch_size,
-        teacher_args.temperature,
-        teacher_args.multi_label,
-        data_args.use_fast_tokenizer,
-        training_args.no_cuda,
-        training_args.fp16,
-    )
-    dataset = Dataset.from_dict(
-        {
-            "text": examples,
-            "labels": teacher_soft_preds,
-        }
-    )
-
-    # 3. create student
-    logger.info("Initializing student model")
-    model = AutoModelForSequenceClassification.from_pretrained(
-        student_args.student_name_or_path, num_labels=len(class_names)
-    )
-    tokenizer = AutoTokenizer.from_pretrained(student_args.student_name_or_path, use_fast=data_args.use_fast_tokenizer)
-    model.config.id2label = {i: label for i, label in enumerate(class_names)}
-    model.config.label2id = {label: i for i, label in enumerate(class_names)}
-
-    # 4. train student on teacher predictions
-    dataset = dataset.map(tokenizer, input_columns="text")
-    dataset.set_format("torch")
-
-    def compute_metrics(p, return_outputs=False):
-        preds = p.predictions.argmax(-1)
-        proxy_labels = p.label_ids.argmax(-1)  # "label_ids" are actually distributions
-        return {"agreement": (preds == proxy_labels).mean().item()}
-
-    trainer = DistillationTrainer(
-        model=model,
-        tokenizer=tokenizer,
-        args=training_args,
-        train_dataset=dataset,
-        compute_metrics=compute_metrics,
-    )
-
-    if training_args.do_train:
-        logger.info("Training student model on teacher predictions")
-        trainer.train()
-
-    if training_args.do_eval:
-        agreement = trainer.evaluate(eval_dataset=dataset)["eval_agreement"]
-        logger.info(f"Agreement of student and teacher predictions: {agreement * 100:0.2f}%")
-
-    trainer.save_model()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -19,7 +19,6 @@ limitations under the License.
 This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
 Please tag @patil-suraj with any issues/unexpected behaviors, or send a PR!
 For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github.com/huggingface/transformers/blob/master/examples/research_projects/bertabs/README.md).
-For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2seq`](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq).

 ### Supported Architectures

@@ -27,207 +26,395 @@ For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2s
 - `MarianMTModel`
 - `PegasusForConditionalGeneration`
 - `MBartForConditionalGeneration`
- `FSMTForConditionalGeneration` (translation only)
+- `FSMTForConditionalGeneration`
 - `T5ForConditionalGeneration`

-`run_summarization.py` and `run_translation.py` are lightweight examples of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
+This directory is in a bit of messy state and is undergoing some cleaning, please bare with us in the meantime :-) Here are the instructions to use the new and old scripts for fine-tuning sequence-to-sequence models.

-For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files
-and you also will find examples of these below.
+## New script

-### Summarization
+The new script for fine-tuning a model on a summarization or translation task is `run_seq2seq.py`. It is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (json or csv), then fine-tune one of the architectures above on it.

 Here is an example on a summarization task:
 ```bash
-python examples/seq2seq/run_summarization.py \
+python examples/seq2seq/run_seq2seq.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
+    --task summarization \
    --dataset_name xsum \
-    --output_dir /tmp/tst-summarization \
+    --output_dir ~/tmp/tst-summarization \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```

-CNN/DailyMail dataset is another commonly used dataset for the task of summarization. To use it replace `--dataset_name xsum` with `--dataset_name cnn_dailymail --dataset_config "3.0.0"`.
-
-And here is how you would use it on your own files, after adjusting the values for the arguments
-`--train_file`, `--validation_file`, `--text_column` and `--summary_column` to match your setup:
-
+And here is how you would use it on your own files (replace `path_to_csv_or_json_file`, `text_column_name` and `summary_column_name` by the relevant values):
 ```bash
-python examples/seq2seq/run_summarization.py \
-    --model_name_or_path t5-small \
+python examples/seq2seq/run_seq2seq.py \
+    -model_name_or_path t5-small \
    --do_train \
    --do_eval \
-    --train_file path_to_csv_or_jsonlines_file \
-    --validation_file path_to_csv_or_jsonlines_file \
-    --output_dir /tmp/tst-summarization \
+    --task summarization \
+    --train_file path_to_csv_or_json_file \
+    --validation_file path_to_csv_or_json_file \
+    --output_dir ~/tmp/tst-summarization \
    --overwrite_output_dir \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
-```
-
-The task of summarization supports custom CSV and JSONLINES formats.
-
-#### Custom CSV Files
-
-If it's a csv file the training and validation files should have a column for the inputs texts and a column for the summaries.
-
-If the csv file has just two columns as in the following example:
-
-```csv
-text,summary
-"I'm sitting here in a boring room. It's just another rainy Sunday afternoon. I'm wasting my time I got nothing to do. I'm hanging around I'm waiting for you. But nothing ever happens. And I wonder","I'm sitting in a room where I'm waiting for something to happen"
-"I see trees so green, red roses too. I see them bloom for me and you. And I think to myself what a wonderful world. I see skies so blue and clouds so white. The bright blessed day, the dark sacred night. And I think to myself what a wonderful world.","I'm a gardener and I'm a big fan of flowers."
-"Christmas time is here. Happiness and cheer. Fun for all that children call. Their favorite time of the year. Snowflakes in the air. Carols everywhere. Olden times and ancient rhymes. Of love and dreams to share","It's that time of year again."
-```
-
-The first column is assumed to be for `text` and the second is for summary.
-
-If the csv file has multiple columns, you can then specify the names of the columns to use:
-
-```bash
    --text_column text_column_name \
-    --summary_column summary_column_name \
+    --summary_column summary_column_name 
 ```
+The training and validation files should have a column for the inputs texts and a column for the summaries.

-For example if the columns were:
-
-```csv
-id,date,text,summary
-```
-
-and you wanted to select only `text` and `summary`, then you'd pass these additional arguments:
-
+Here is an example of a translation fine-tuning:
 ```bash
-    --text_column text \
-    --summary_column summary \
-```
-
-#### Custom JSONFILES Files
-
-The second supported format is jsonlines. Here is an example of a jsonlines custom data file.
-
-
-```json
-{"text": "I'm sitting here in a boring room. It's just another rainy Sunday afternoon. I'm wasting my time I got nothing to do. I'm hanging around I'm waiting for you. But nothing ever happens. And I wonder", "summary": "I'm sitting in a room where I'm waiting for something to happen"}
-{"text": "I see trees so green, red roses too. I see them bloom for me and you. And I think to myself what a wonderful world. I see skies so blue and clouds so white. The bright blessed day, the dark sacred night. And I think to myself what a wonderful world.", "summary": "I'm a gardener and I'm a big fan of flowers."}
-{"text": "Christmas time is here. Happiness and cheer. Fun for all that children call. Their favorite time of the year. Snowflakes in the air. Carols everywhere. Olden times and ancient rhymes. Of love and dreams to share", "summary": "It's that time of year again."}
-```
-
-Same as with the CSV files, by default the first value will be used as the text record and the second as the summary record. Therefore you can use any key names for the entries, in this example `text` and `summary` were used.
-
-And as with the CSV files, you can specify which values to select from the file, by explicitly specifying the corresponding key names. In our example this again would be:
-
-```bash
-    --text_column text \
-    --summary_column summary \
-```
-
-
-
-### Translation
-
-Here is an example of a translation fine-tuning with T5:
-
-```bash
-python examples/seq2seq/run_translation.py \
-    --model_name_or_path t5-small \
-    --do_train \
-    --do_eval \
-    --source_lang en \
-    --target_lang ro \
-    --dataset_name wmt16 \
-    --dataset_config_name ro-en \
-    --output_dir /tmp/tst-translation \
-    --per_device_train_batch_size=4 \
-    --per_device_eval_batch_size=4 \
-    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
-```
-
-And the same with MBart:
-
-```bash
-python examples/seq2seq/run_translation.py \
-    --model_name_or_path facebook/mbart-large-en-ro  \
+python examples/seq2seq/run_seq2seq.py \
+    --model_name_or_path sshleifer/student_marian_en_ro_6_1 \
    --do_train \
    --do_eval \
+    --task translation_en_to_ro \
    --dataset_name wmt16 \
    --dataset_config_name ro-en \
    --source_lang en_XX \
-    --target_lang ro_RO \
-    --output_dir /tmp/tst-translation \
+    --target_lang ro_RO\
+    --output_dir ~/tmp/tst-translation \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
- ```
-
-Note, that depending on the used model additional language-specific command-line arguments are sometimes required. Specifically:
-
-* MBart models require different `--{source,target}_lang` values, e.g. in place of `en` it expects `en_XX`, for `ro` it expects `ro_RO`. The full MBart specification for language codes can be looked up [here](https://huggingface.co/facebook/mbart-large-cc25)
-* T5 models can use a `--source_prefix` argument to override the otherwise automated prefix of the form `translate {source_lang} to {target_lang}` for `run_translation.py` and `summarize: ` for `run_summarization.py`
-
-Also, if you switch to a different language pair, make sure to adjust the source and target values in all command line arguments.
-
-And here is how you would use the translation finetuning on your own files, after adjusting the
-values for the arguments `--train_file`, `--validation_file` to match your setup:
+    --predict_with_generate
+```

+And here is how you would use it on your own files (replace `path_to_json_file`, by the relevant values):
 ```bash
-python examples/seq2seq/run_translation.py \
-    --model_name_or_path t5-small \
+python examples/seq2seq/run_seq2seq.py \
+    --model_name_or_path sshleifer/student_marian_en_ro_6_1 \
    --do_train \
    --do_eval \
-    --source_lang en \
-    --target_lang ro \
+    --task translation_en_to_ro \
    --dataset_name wmt16 \
    --dataset_config_name ro-en \
-    --train_file path_to_jsonlines_file \
-    --validation_file path_to_jsonlines_file \
-    --output_dir /tmp/tst-translation \
+    --source_lang en_XX \
+    --target_lang ro_RO\
+    --train_file path_to_json_file \
+    --validation_file path_to_json_file \
+    --output_dir ~/tmp/tst-translation \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```
+Here the files are expected to be JSON files, with each input being a dictionary with a key `"translation"` containing one key per language (here `"en"` and `"ro"`).

-The task of translation supports only custom JSONLINES files, with each line being a dictionary with a key `"translation"` and its value another dictionary whose keys is the language pair. For example:
+## Old script

-```json
-{ "translation": { "en": "Others have dismissed him as a joke.", "ro": "Alții l-au numit o glumă." } }
-{ "translation": { "en": "And some are holding out for an implosion.", "ro": "Iar alții așteaptă implozia." } }
-```
-Here the languages are Romanian (`ro`) and English (`en`).
+The new script is very new and hasn't been widely tested yet. It also misses a few functionality offered by the old
+script, which is why we are leaving the old script here for now.

-If you want to use a pre-processed dataset that leads to high bleu scores, but for the `en-de` language pair, you can use `--dataset_name wmt14-en-de-pre-processed`, as following:
+### Downlowd the Datasets
+
+#### XSUM

 ```bash
-python examples/seq2seq/run_translation.py \
-    --model_name_or_path t5-small \
-    --do_train \
-    --do_eval \
-    --source_lang en \
-    --target_lang de \
-    --dataset_name wmt14-en-de-pre-processed \
-    --output_dir /tmp/tst-translation \
-    --per_device_train_batch_size=4 \
-    --per_device_eval_batch_size=4 \
-    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
- ```
+cd examples/seq2seq
+wget https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz
+tar -xzvf xsum.tar.gz
+export XSUM_DIR=${PWD}/xsum
+```
+this should make a directory called `xsum/` with files like `test.source`.
+To use your own data, copy that files format. Each article to be summarized is on its own line.
+
+#### CNN/DailyMail
+
+```bash
+cd examples/seq2seq
+wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz
+tar -xzvf cnn_dm_v2.tgz  # empty lines removed
+mv cnn_cln cnn_dm
+export CNN_DIR=${PWD}/cnn_dm
+```
+this should make a directory called `cnn_dm/` with 6 files.
+
+#### WMT16 English-Romanian Translation Data
+
+download with this command:
+```bash
+wget https://cdn-datasets.huggingface.co/translation/wmt_en_ro.tar.gz
+tar -xzvf wmt_en_ro.tar.gz
+export ENRO_DIR=${PWD}/wmt_en_ro
+```
+this should make a directory called `wmt_en_ro/` with 6 files.
+
+#### WMT English-German
+
+```bash
+wget https://cdn-datasets.huggingface.co/translation/wmt_en_de.tgz
+tar -xzvf wmt_en_de.tgz
+export DATA_DIR=${PWD}/wmt_en_de
+```
+
+#### FSMT datasets (wmt)
+
+Refer to the scripts starting with `eval_` under:
+https://github.com/huggingface/transformers/tree/master/scripts/fsmt
+
+#### Pegasus (multiple datasets)
+
+Multiple eval datasets are available for download from: 
+https://github.com/stas00/porting/tree/master/datasets/pegasus
+
+
+#### Your Data
+
+If you are using your own data, it must be formatted as one directory with 6 files:
+```
+train.source
+train.target
+val.source
+val.target
+test.source
+test.target
+```
+The `.source` files are the input, the `.target` files are the desired output.
+
+### Potential issues
+
+- native AMP (`--fp16` and no apex) may lead to a huge memory leak and require 10x gpu memory. This has been fixed in pytorch-nightly and the minimal official version to have this fix will be pytorch-1.7.1. Until then if you have to use mixed precision please use AMP only with pytorch-nightly or NVIDIA's apex. Reference: https://github.com/huggingface/transformers/issues/8403
+
+
+### Tips and Tricks
+
+General Tips:
+- since you need to run from `examples/seq2seq`, and likely need to modify code, the easiest workflow is fork transformers, clone your fork, and run `pip install -e .` before you get started.
+- try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size.  (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
+- `fp16_opt_level=O1` (the default works best).
+- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
+Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr)`.
+- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
+- This warning can be safely ignored:
+    > "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
+- Both finetuning and eval are 30% faster with `--fp16`. For that you need to [install apex](https://github.com/NVIDIA/apex#quick-start).
+- Read scripts before you run them!
+
+Summarization Tips:
+- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
+- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
+- For CNN/DailyMail, the default `val_max_target_length` and `test_max_target_length` will truncate the ground truth labels, resulting in slightly higher rouge scores. To get accurate rouge scores, you should rerun calculate_rouge on the `{output_dir}/test_generations.txt` file saved by `trainer.test()`
+- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
+- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
+- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
+(It rarely makes sense to start from `bart-large` unless you are a researching finetuning methods).
+
+**Update 2018-07-18**
+Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
+Future work/help wanted: A new dataset to support multilingual tasks.
+
+
+### Fine-tuning using Seq2SeqTrainer
+To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer`-related `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.
+
+With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set.
+
+To see all the possible command line options, run:
+
+```bash
+python finetune_trainer.py --help
+```
+
+For multi-gpu training use `torch.distributed.launch`, e.g. with 2 gpus:
+```bash
+python -m torch.distributed.launch --nproc_per_node=2  finetune_trainer.py ...
+```
+
+**At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
+
+All `Seq2SeqTrainer`-based fine-tuning scripts are included in the `builtin_trainer` directory.
+
+#### TPU Training
+`Seq2SeqTrainer` supports TPU training with few caveats
+1. As `generate` method does not work on TPU at the moment, `predict_with_generate` cannot be used. You should use `--prediction_loss_only` to only calculate loss, and do not set `--do_predict` and `--predict_with_generate`.
+2. All sequences should be padded to be of equal length to avoid extremely slow training. (`finetune_trainer.py` does this automatically when running on TPU.)
+
+We provide a very simple launcher script named `xla_spawn.py` that lets you run our example scripts on multiple TPU cores without any boilerplate. Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for `torch.distributed`).
+
+`builtin_trainer/finetune_tpu.sh` script provides minimal arguments needed for TPU training.
+
+The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on TPU V3-8 and should complete one epoch in ~5-6 mins.
+
+```bash
+./builtin_trainer/train_distil_marian_enro_tpu.sh
+```
+
+## Evaluation Commands
+
+To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.
+If 'translation' is in your task name, the computed metric will be BLEU. Otherwise, ROUGE will be used.
+
+For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
+```bash
+export DATA_DIR=wmt_en_ro
+./run_eval.py t5-base \
+    $DATA_DIR/val.source t5_val_generations.txt \
+    --reference_path $DATA_DIR/val.target \
+    --score_path enro_bleu.json \
+    --task translation_en_to_ro \
+    --n_obs 100 \
+    --device cuda \
+    --fp16 \
+    --bs 32
+```
+
+This command works for MBART, although the BLEU score is suspiciously low.
+```bash
+export DATA_DIR=wmt_en_ro
+./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
+    --reference_path $DATA_DIR/val.target \
+    --score_path enro_bleu.json \
+    --task translation \
+    --n_obs 100 \
+    --device cuda \
+    --fp16 \
+    --bs 32
+```
+
+Summarization (xsum will be very similar):
+```bash
+export DATA_DIR=cnn_dm
+./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
+    --reference_path $DATA_DIR/val.target \
+    --score_path cnn_rouge.json \
+    --task summarization \
+    --n_obs 100 \
+
+th 56 \
+    --fp16 \
+    --bs 32
+```
+
+### Multi-GPU Evaluation
+here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases 
+because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have 
+`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=8  run_distributed_eval.py \
+    --model_name sshleifer/distilbart-large-xsum-12-3  \
+    --save_dir xsum_generations \
+    --data_dir xsum \
+    --fp16  # you can pass generate kwargs like num_beams here, just like run_eval.py
+```
+
+Contributions that implement this command for other distributed hardware setups are welcome!
+
+#### Single-GPU Eval: Tips and Tricks
+
+When using `run_eval.py`, the following features can be useful:
+
+* if you running the script multiple times and want to make it easier to track what arguments produced that output, use `--dump-args`. Along with the results it will also dump any custom params that were passed to the script. For example if you used: `--num_beams 8 --early_stopping true`, the output will be:
+   ```
+   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True}
+   ```
+
+   `--info` is an additional argument available for the same purpose of tracking the conditions of the experiment. It's useful to pass things that weren't in the argument list, e.g. a language pair `--info "lang:en-ru"`. But also if you pass `--info` without a value it will fallback to the current date/time string, e.g. `2020-09-13 18:44:43`.
+
+   If using `--dump-args --info`, the output will be:
+   
+   ```
+   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': '2020-09-13 18:44:43'}
+   ```
+
+   If using `--dump-args --info "pair:en-ru chkpt=best`, the output will be:
+   
+   ```
+   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': 'pair=en-ru chkpt=best'}
+   ```
+      
+
+* if you need to perform a parametric search in order to find the best ones that lead to the highest BLEU score, let `run_eval_search.py` to do the searching for you.
+
+   The script accepts the exact same arguments as `run_eval.py`, plus an additional argument `--search`. The value of `--search` is parsed, reformatted and fed to ``run_eval.py`` as additional args.
+
+   The format for the `--search` value is a simple string with hparams and colon separated values to try, e.g.:
+   ```
+    --search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
+   ```
+   which will generate `12` `(2*3*2)` searches for a product of each hparam. For example the example that was just used will invoke `run_eval.py` repeatedly with:
+   
+   ```
+    --num_beams 5 --length_penalty 0.8 --early_stopping true
+    --num_beams 5 --length_penalty 0.8 --early_stopping false
+    [...]
+    --num_beams 10 --length_penalty 1.2 --early_stopping false
+   ```
+   
+   On completion, this function prints a markdown table of the results sorted by the best BLEU score and the winning arguments.
+
+```
+bleu  | num_beams | length_penalty | early_stopping
+----- | --------- | -------------- | --------------
+26.71 |         5 |            1.1 |              1
+26.66 |         5 |            0.9 |              1
+26.66 |         5 |            0.9 |              0
+26.41 |         5 |            1.1 |              0
+21.94 |         1 |            0.9 |              1
+21.94 |         1 |            0.9 |              0
+21.94 |         1 |            1.1 |              1
+21.94 |         1 |            1.1 |              0
+
+Best score args:
+stas/wmt19-en-ru data/en-ru/val.source data/en-ru/test_translations.txt --reference_path data/en-ru/val.target --score_path data/en-ru/test_bleu.json --bs 8 --task translation --num_beams 5 --length_penalty 1.1 --early_stopping True
+```
+
+If you pass `--info "some experiment-specific info"` it will get printed before the results table - this is useful for scripting and multiple runs, so one can tell the different sets of results from each other.
+
+
+### Contributing
+- follow the standard contributing guidelines and code of conduct.
+- add tests to `test_seq2seq_examples.py`
+- To run only the seq2seq tests, you must be in the root of the repository and run:
+```bash
+pytest examples/seq2seq/
+```
+
+### Converting pytorch-lightning checkpoints
+pytorch lightning ``-do_predict`` often fails, after you are done training, the best way to evaluate your model is to convert it.
+
+This should be done for you, with a file called `{save_dir}/best_tfmr`. 
+
+If that file doesn't exist but you have a lightning `.ckpt` file, you can run
+```bash
+python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT  randomly_initialized_hf_model_path save_dir/best_tfmr
+```
+Then either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections)
+
+
+# Experimental Features 
+These features are harder to use and not always useful.
+
+###  Dynamic Batch Size for MT
+`finetune.py` has a command line arg `--max_tokens_per_batch` that allows batches to be dynamically sized.
+This feature can only be used:
+- with fairseq installed
+- on 1 GPU
+- without sortish sampler
+- after calling `./save_len_file.py $tok $data_dir`
+
+For example, 
+```bash
+./save_len_file.py Helsinki-NLP/opus-mt-en-ro  wmt_en_ro
+./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
+```
+splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
+
+For comparison,
+```bash
+./dynamic_bs_example.sh --sortish_sampler --train_batch_size 48
+```
+uses 12,723 batches of length 48 and takes slightly more time 9.5 minutes.
+
+The feature is still experimental, because:
+ we can make it much more robust if we have memory mapped/preprocessed datasets.
+ The speedup over sortish sampler is not that large at the moment.
+
--- a/examples/legacy/seq2seq/init.py
+++ b/examples/legacy/seq2seq/init.py
--- a/examples/legacy/seq2seq/convert_model_to_fp16.py
+++ b/examples/legacy/seq2seq/convert_model_to_fp16.py
--- a/examples/legacy/seq2seq/download_wmt.py
+++ b/examples/legacy/seq2seq/download_wmt.py
--- a/examples/seq2seq/ds_config.json
+++ b/examples/seq2seq/ds_config.json
@@ -0,0 +1,47 @@
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+   "zero_optimization": {
+       "stage": 2,
+       "allgather_partitions": true,
+       "allgather_bucket_size": 2e8,
+       "overlap_comm": true,
+       "reduce_scatter": true,
+       "reduce_bucket_size": 2e8,
+       "contiguous_gradients": true,
+       "cpu_offload": true
+   },
+
+   "zero_allow_untested_optimizer": true,
+
+   "optimizer": {
+     "type": "AdamW",
+     "params": {
+       "lr": 3e-5,
+       "betas": [
+         0.8,
+         0.999
+       ],
+       "eps": 1e-8,
+       "weight_decay": 3e-7
+     }
+   },
+
+   "scheduler": {
+     "type": "WarmupLR",
+     "params": {
+       "warmup_min_lr": 0,
+       "warmup_max_lr": 3e-5,
+       "warmup_num_steps": 500
+     }
+   },
+
+    "steps_per_print": 2000,
+    "wall_clock_breakdown": false
+}
--- a/examples/legacy/seq2seq/finetune.sh
+++ b/examples/legacy/seq2seq/finetune.sh
--- a/examples/legacy/seq2seq/finetune_tpu.sh
+++ b/examples/legacy/seq2seq/finetune_tpu.sh
--- a/examples/legacy/seq2seq/finetune_trainer.py
+++ b/examples/legacy/seq2seq/finetune_trainer.py
--- a/examples/legacy/seq2seq/minify_dataset.py
+++ b/examples/legacy/seq2seq/minify_dataset.py
--- a/examples/legacy/seq2seq/pack_dataset.py
+++ b/examples/legacy/seq2seq/pack_dataset.py
--- a/examples/seq2seq/requirements.txt
+++ b/examples/seq2seq/requirements.txt
@@ -1,6 +1,20 @@
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
 datasets >= 1.1.3
+fire
+pytest
+conllu
 sentencepiece != 0.1.92
 protobuf
-sacrebleu >= 1.4.12
-rouge-score
-nltk
--- a/examples/legacy/seq2seq/romanian_postprocessing.md
+++ b/examples/legacy/seq2seq/romanian_postprocessing.md
--- a/examples/legacy/seq2seq/rouge_cli.py
+++ b/examples/legacy/seq2seq/rouge_cli.py
--- a/examples/legacy/seq2seq/run_distributed_eval.py
+++ b/examples/legacy/seq2seq/run_distributed_eval.py
--- a/examples/legacy/seq2seq/run_eval.py
+++ b/examples/legacy/seq2seq/run_eval.py
@@ -132,14 +132,8 @@ def run_generate(verbose=True):
    if args.n_obs > 0:
        examples = examples[: args.n_obs]
    Path(args.save_path).parent.mkdir(exist_ok=True)
-
    if args.reference_path is None and Path(args.score_path).exists():
        warnings.warn(f"score_path {args.score_path} will be overwritten unless you type ctrl-c.")
-
-    if args.device == "cpu" and args.fp16:
-        # this mix leads to RuntimeError: "threshold_cpu" not implemented for 'Half'
-        raise ValueError("Can't mix --fp16 and --device cpu")
-
    runtime_metrics = generate_summaries_or_translations(
        examples,
        args.save_path,
--- a/examples/legacy/seq2seq/run_eval_search.py
+++ b/examples/legacy/seq2seq/run_eval_search.py
--- a/examples/seq2seq/run_translation.py
+++ b/examples/seq2seq/run_translation.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
 #
@@ -20,6 +19,7 @@ Fine-tuning the library models for sequence to sequence.

 import logging
 import os
+import re
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
@@ -35,19 +35,14 @@ from transformers import (
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBartTokenizer,
-    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
 logger = logging.getLogger(__name__)


@@ -93,28 +88,31 @@ class DataTrainingArguments:
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

-    source_lang: str = field(default=None, metadata={"help": "Source language id for translation."})
-    target_lang: str = field(default=None, metadata={"help": "Target language id for translation."})
-
+    task: str = field(
+        default="summarization",
+        metadata={
+            "help": "The name of the task, should be summarization (or summarization_{dataset} for evaluating "
+            "pegasus) or translation (or translation_{xx}_to_{yy})."
+        },
+    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
-    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a jsonlines)."})
+    text_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
+    )
+    summary_column: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
-        metadata={
-            "help": "An optional input evaluation data file to evaluate the metrics (sacreblue) on "
-            "a jsonlines file."
-        },
-    )
-    test_file: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "An optional input test data file to evaluate the metrics (sacreblue) on " "a jsonlines file."
-        },
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
@@ -168,20 +166,9 @@ class DataTrainingArguments:
            "value if set."
        },
    )
-    max_test_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
-            "value if set."
-        },
-    )
-    num_beams: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
-            "which is used during ``evaluate`` and ``predict``."
-        },
-    )
+    source_lang: Optional[str] = field(default=None, metadata={"help": "Source language id for translation."})
+    target_lang: Optional[str] = field(default=None, metadata={"help": "Target language id for translation."})
+    eval_beams: Optional[int] = field(default=None, metadata={"help": "Number of beams to use for evaluation."})
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={
@@ -195,19 +182,36 @@ class DataTrainingArguments:
    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
-        elif self.source_lang is None or self.target_lang is None:
-            raise ValueError("Need to specify the source language and the target language.")
-
-        if self.train_file is not None:
-            extension = self.train_file.split(".")[-1]
-            assert extension == "json", "`train_file` should be a json file."
-        if self.validation_file is not None:
-            extension = self.validation_file.split(".")[-1]
-            assert extension == "json", "`validation_file` should be a json file."
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+        if not self.task.startswith("summarization") and not self.task.startswith("translation"):
+            raise ValueError(
+                "`task` should be summarization, summarization_{dataset}, translation or translation_{xx}_to_{yy}."
+            )
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length


+summarization_name_mapping = {
+    "amazon_reviews_multi": ("review_body", "review_title"),
+    "big_patent": ("description", "abstract"),
+    "cnn_dailymail": ("article", "highlights"),
+    "orange_sum": ("text", "summary"),
+    "pn_summary": ("article", "summary"),
+    "psc": ("extract_text", "summary_text"),
+    "samsum": ("dialogue", "summary"),
+    "thaisum": ("body", "summary"),
+    "xglue": ("news_body", "news_title"),
+    "xsum": ("document", "summary"),
+    "wiki_summary": ("article", "highlights"),
+}
+
+
 def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
@@ -221,18 +225,6 @@ def main():
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

-    if data_args.source_prefix is None and model_args.model_name_or_path in [
-        "t5-small",
-        "t5-base",
-        "t5-large",
-        "t5-3b",
-        "t5-11b",
-    ]:
-        logger.warning(
-            "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with "
-            "`--source_prefix 'translate English to German: ' `"
-        )
-
    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
@@ -269,10 +261,13 @@ def main():
    # Set seed before initializing model.
    set_seed(training_args.seed)

-    # Get the datasets: you can either provide your own JSON training and evaluation files (see below)
+    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
+    # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the
+    # second column for the summaries (unless you specify column names for this with the `text_column` and
+    # `summary_column` arguments).
    # For translation, only JSON files are supported, with one field named "translation" containing two keys for the
    # source and target languages (unless you adapt what follows).
    #
@@ -289,9 +284,6 @@ def main():
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
-        if data_args.test_file is not None:
-            data_files["test"] = data_args.test_file
-            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -324,57 +316,81 @@ def main():
    )

    # Set decoder_start_token_id
-    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
-        assert (
-            data_args.target_lang is not None and data_args.source_lang is not None
-        ), "mBart requires --target_lang and --source_lang"
-        if isinstance(tokenizer, MBartTokenizer):
-            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
-        else:
-            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.target_lang)
-
+    if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
+        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang]
    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

-    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
+    # Get the default prefix if None is passed.
+    if data_args.source_prefix is None:
+        task_specific_params = model.config.task_specific_params
+        if task_specific_params is not None:
+            prefix = task_specific_params.get("prefix", "")
+        else:
+            prefix = ""
+    else:
+        prefix = data_args.source_prefix

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = datasets["train"].column_names
-    elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
-    elif training_args.do_predict:
-        column_names = datasets["test"].column_names
    else:
-        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
-        return
+        column_names = datasets["validation"].column_names

    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
-    if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
+    if data_args.task.startswith("translation"):
        if data_args.source_lang is not None:
            tokenizer.src_lang = data_args.source_lang
        if data_args.target_lang is not None:
            tokenizer.tgt_lang = data_args.target_lang

-    # Get the language codes for input/target.
-    source_lang = data_args.source_lang.split("_")[0]
-    target_lang = data_args.target_lang.split("_")[0]
+    # To serialize preprocess_function below, each of those four variables needs to be defined (even if we won't use
+    # them all).
+    source_lang, target_lang, text_column, summary_column = None, None, None, None
+
+    if data_args.task.startswith("summarization"):
+        # Get the column names for input/target.
+        dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
+        if data_args.text_column is None:
+            text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
+        else:
+            text_column = data_args.text_column
+        if data_args.summary_column is None:
+            summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
+        else:
+            summary_column = data_args.summary_column
+    else:
+        # Get the language codes for input/target.
+        lang_search = re.match("translation_([a-z]+)_to_([a-z]+)", data_args.task)
+        if data_args.source_lang is not None:
+            source_lang = data_args.source_lang.split("_")[0]
+        else:
+            assert (
+                lang_search is not None
+            ), "Provide a source language via --source_lang or rename your task 'translation_xx_to_yy'."
+            source_lang = lang_search.groups()[0]
+
+        if data_args.target_lang is not None:
+            target_lang = data_args.target_lang.split("_")[0]
+        else:
+            assert (
+                lang_search is not None
+            ), "Provide a target language via --target_lang or rename your task 'translation_xx_to_yy'."
+            target_lang = lang_search.groups()[1]

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

-    if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
-        logger.warn(
-            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
-            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
-        )
-
    def preprocess_function(examples):
-        inputs = [ex[source_lang] for ex in examples["translation"]]
-        targets = [ex[target_lang] for ex in examples["translation"]]
+        if data_args.task.startswith("translation"):
+            inputs = [ex[source_lang] for ex in examples["translation"]]
+            targets = [ex[target_lang] for ex in examples["translation"]]
+        else:
+            inputs = examples[text_column]
+            targets = examples[summary_column]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

@@ -394,8 +410,6 @@ def main():

    if training_args.do_train:
        train_dataset = datasets["train"]
-        if "train" not in datasets:
-            raise ValueError("--do_train requires a train dataset")
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
@@ -408,8 +422,6 @@ def main():

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
-        if "validation" not in datasets:
-            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
@@ -421,21 +433,6 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

-    if training_args.do_predict:
-        max_target_length = data_args.val_max_target_length
-        if "test" not in datasets:
-            raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        test_dataset = test_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    if data_args.pad_to_max_length:
@@ -443,19 +440,13 @@ def main():
    else:
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
-            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if training_args.fp16 else None,
        )

    # Metric
-    metric = load_metric("sacrebleu")
-
-    def postprocess_text(preds, labels):
-        preds = [pred.strip() for pred in preds]
-        labels = [[label.strip()] for label in labels]
-
-        return preds, labels
+    metric_name = "rouge" if data_args.task.startswith("summarization") else "sacrebleu"
+    metric = load_metric(metric_name)

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
@@ -468,14 +459,22 @@ def main():
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing
-        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+        decoded_preds = [pred.strip() for pred in decoded_preds]
+        decoded_labels = [label.strip() for label in decoded_labels]
+        if metric_name == "sacrebleu":
+            decoded_labels = [[label] for label in decoded_labels]

        result = metric.compute(predictions=decoded_preds, references=decoded_labels)
-        result = {"bleu": result["score"]}
+
+        # Extract a few results from ROUGE
+        if metric_name == "rouge":
+            result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
+        else:
+            result = {"bleu": result["score"]}

        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
        result["gen_len"] = np.mean(prediction_lens)
-        result = {k: round(v, 4) for k, v in result.items()}
+
        return result

    # Initialize our Trainer
@@ -500,55 +499,31 @@ def main():
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

-        metrics = train_result.metrics
-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate(
-            max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
-        )
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
-
-    if training_args.do_predict:
-        logger.info("*** Test ***")
-
-        test_results = trainer.predict(
-            test_dataset,
-            metric_key_prefix="test",
-            max_length=data_args.val_max_target_length,
-            num_beams=data_args.num_beams,
-        )
-        metrics = test_results.metrics
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
-
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
+        results = trainer.evaluate()

+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_seq2seq.txt")
        if trainer.is_world_process_zero():
-            if training_args.predict_with_generate:
-                test_preds = tokenizer.batch_decode(
-                    test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
-                )
-                test_preds = [pred.strip() for pred in test_preds]
-                output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt")
-                with open(output_test_preds_file, "w") as writer:
-                    writer.write("\n".join(test_preds))
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

    return results

--- a/examples/seq2seq/run_summarization.py
+++ b/examples/seq2seq/run_summarization.py
@@ -1,595 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for sequence to sequence.
-"""
-# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
-
-import logging
-import os
-import sys
-from dataclasses import dataclass, field
-from typing import Optional
-
-import nltk  # Here to have a nice missing dependency error message early on
-import numpy as np
-from datasets import load_dataset, load_metric
-
-import transformers
-from filelock import FileLock
-from transformers import (
-    AutoConfig,
-    AutoModelForSeq2SeqLM,
-    AutoTokenizer,
-    DataCollatorForSeq2Seq,
-    HfArgumentParser,
-    Seq2SeqTrainer,
-    Seq2SeqTrainingArguments,
-    default_data_collator,
-    set_seed,
-)
-from transformers.file_utils import is_offline_mode
-from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version
-
-
-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
-
-logger = logging.getLogger(__name__)
-
-try:
-    nltk.data.find("tokenizers/punkt")
-except (LookupError, OSError):
-    if is_offline_mode():
-        raise LookupError(
-            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
-        )
-    with FileLock(".lock") as lock:
-        nltk.download("punkt", quiet=True)
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
-    )
-    use_fast_tokenizer: bool = field(
-        default=True,
-        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
-    )
-    model_revision: str = field(
-        default="main",
-        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
-    )
-    use_auth_token: bool = field(
-        default=False,
-        metadata={
-            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
-            "with private models)."
-        },
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    dataset_name: Optional[str] = field(
-        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
-    )
-    dataset_config_name: Optional[str] = field(
-        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
-    )
-    text_column: Optional[str] = field(
-        default=None,
-        metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
-    )
-    summary_column: Optional[str] = field(
-        default=None,
-        metadata={"help": "The name of the column in the datasets containing the summaries (for summarization)."},
-    )
-    train_file: Optional[str] = field(
-        default=None, metadata={"help": "The input training data file (a jsonlines or csv file)."}
-    )
-    validation_file: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "An optional input evaluation data file to evaluate the metrics (rouge) on "
-            "(a jsonlines or csv file)."
-        },
-    )
-    test_file: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "An optional input test data file to evaluate the metrics (rouge) on " "(a jsonlines or csv file)."
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-    preprocessing_num_workers: Optional[int] = field(
-        default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
-    )
-    max_source_length: Optional[int] = field(
-        default=1024,
-        metadata={
-            "help": "The maximum total input sequence length after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
-        },
-    )
-    max_target_length: Optional[int] = field(
-        default=128,
-        metadata={
-            "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded."
-        },
-    )
-    val_max_target_length: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
-            "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
-            "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
-            "during ``evaluate`` and ``predict``."
-        },
-    )
-    pad_to_max_length: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether to pad all samples to model maximum sentence length. "
-            "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
-            "efficient on GPU but very bad for TPU."
-        },
-    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )
-    max_test_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
-            "value if set."
-        },
-    )
-    num_beams: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
-            "which is used during ``evaluate`` and ``predict``."
-        },
-    )
-    ignore_pad_token_for_loss: bool = field(
-        default=True,
-        metadata={
-            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
-        },
-    )
-    source_prefix: Optional[str] = field(
-        default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
-    )
-
-    def __post_init__(self):
-        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-            raise ValueError("Need either a dataset name or a training/validation file.")
-        else:
-            if self.train_file is not None:
-                extension = self.train_file.split(".")[-1]
-                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
-            if self.validation_file is not None:
-                extension = self.validation_file.split(".")[-1]
-                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
-        if self.val_max_target_length is None:
-            self.val_max_target_length = self.max_target_length
-
-
-summarization_name_mapping = {
-    "amazon_reviews_multi": ("review_body", "review_title"),
-    "big_patent": ("description", "abstract"),
-    "cnn_dailymail": ("article", "highlights"),
-    "orange_sum": ("text", "summary"),
-    "pn_summary": ("article", "summary"),
-    "psc": ("extract_text", "summary_text"),
-    "samsum": ("dialogue", "summary"),
-    "thaisum": ("body", "summary"),
-    "xglue": ("news_body", "news_title"),
-    "xsum": ("document", "summary"),
-    "wiki_summary": ("article", "highlights"),
-}
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
-    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-        # If we pass only one argument to the script and it's the path to a json file,
-        # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
-    else:
-        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    if data_args.source_prefix is None and model_args.model_name_or_path in [
-        "t5-small",
-        "t5-base",
-        "t5-large",
-        "t5-3b",
-        "t5-11b",
-    ]:
-        logger.warning(
-            "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
-            "`--source_prefix 'summarize: ' `"
-        )
-
-    # Detecting last checkpoint.
-    last_checkpoint = None
-    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
-        last_checkpoint = get_last_checkpoint(training_args.output_dir)
-        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
-            raise ValueError(
-                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
-                "Use --overwrite_output_dir to overcome."
-            )
-        elif last_checkpoint is not None:
-            logger.info(
-                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
-                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
-            )
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        handlers=[logging.StreamHandler(sys.stdout)],
-    )
-    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
-
-    # Log on each process the small summary:
-    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
-    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_rank):
-        transformers.utils.logging.set_verbosity_info()
-    logger.info("Training/evaluation parameters %s", training_args)
-
-    # Set seed before initializing model.
-    set_seed(training_args.seed)
-
-    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
-    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-    # (the dataset will be downloaded automatically from the datasets Hub).
-    #
-    # For CSV/JSON files this script will use the first column for the full texts and the second column for the
-    # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
-    #
-    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
-    # download the dataset.
-    if data_args.dataset_name is not None:
-        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
-    else:
-        data_files = {}
-        if data_args.train_file is not None:
-            data_files["train"] = data_args.train_file
-            extension = data_args.train_file.split(".")[-1]
-        if data_args.validation_file is not None:
-            data_files["validation"] = data_args.validation_file
-            extension = data_args.validation_file.split(".")[-1]
-        if data_args.test_file is not None:
-            data_files["test"] = data_args.test_file
-            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files)
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Load pretrained model and tokenizer
-    #
-    # Distributed training:
-    # The .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-    config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-        use_fast=model_args.use_fast_tokenizer,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-    )
-    model = AutoModelForSeq2SeqLM.from_pretrained(
-        model_args.model_name_or_path,
-        from_tf=bool(".ckpt" in model_args.model_name_or_path),
-        config=config,
-        cache_dir=model_args.cache_dir,
-        revision=model_args.model_revision,
-        use_auth_token=True if model_args.use_auth_token else None,
-    )
-
-    if model.config.decoder_start_token_id is None:
-        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
-
-    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
-
-    # Preprocessing the datasets.
-    # We need to tokenize inputs and targets.
-    if training_args.do_train:
-        column_names = datasets["train"].column_names
-    elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
-    elif training_args.do_predict:
-        column_names = datasets["test"].column_names
-    else:
-        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
-        return
-
-    # Get the column names for input/target.
-    dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
-    if data_args.text_column is None:
-        text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
-    else:
-        text_column = data_args.text_column
-        if text_column not in column_names:
-            raise ValueError(
-                f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
-            )
-    if data_args.summary_column is None:
-        summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
-    else:
-        summary_column = data_args.summary_column
-        if summary_column not in column_names:
-            raise ValueError(
-                f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
-            )
-
-    # Temporarily set max_target_length for training.
-    max_target_length = data_args.max_target_length
-    padding = "max_length" if data_args.pad_to_max_length else False
-
-    if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
-        logger.warn(
-            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
-            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
-        )
-
-    def preprocess_function(examples):
-        inputs = examples[text_column]
-        targets = examples[summary_column]
-        inputs = [prefix + inp for inp in inputs]
-        model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
-
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
-
-        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
-        # padding in the loss.
-        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
-            labels["input_ids"] = [
-                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
-            ]
-
-        model_inputs["labels"] = labels["input_ids"]
-        return model_inputs
-
-    if training_args.do_train:
-        train_dataset = datasets["train"]
-        if "train" not in datasets:
-            raise ValueError("--do_train requires a train dataset")
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
-    if training_args.do_eval:
-        max_target_length = data_args.val_max_target_length
-        if "validation" not in datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
-    if training_args.do_predict:
-        max_target_length = data_args.val_max_target_length
-        if "test" not in datasets:
-            raise ValueError("--do_predict requires a test dataset")
-        test_dataset = datasets["test"]
-        if data_args.max_test_samples is not None:
-            test_dataset = test_dataset.select(range(data_args.max_test_samples))
-        test_dataset = test_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
-
-    # Data collator
-    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
-    if data_args.pad_to_max_length:
-        data_collator = default_data_collator
-    else:
-        data_collator = DataCollatorForSeq2Seq(
-            tokenizer,
-            model=model,
-            label_pad_token_id=label_pad_token_id,
-            pad_to_multiple_of=8 if training_args.fp16 else None,
-        )
-
-    # Metric
-    metric = load_metric("rouge")
-
-    def postprocess_text(preds, labels):
-        preds = [pred.strip() for pred in preds]
-        labels = [label.strip() for label in labels]
-
-        # rougeLSum expects newline after each sentence
-        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
-        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
-
-        return preds, labels
-
-    def compute_metrics(eval_preds):
-        preds, labels = eval_preds
-        if isinstance(preds, tuple):
-            preds = preds[0]
-        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-        if data_args.ignore_pad_token_for_loss:
-            # Replace -100 in the labels as we can't decode them.
-            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
-        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-
-        # Some simple post-processing
-        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
-
-        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
-        # Extract a few results from ROUGE
-        result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
-
-        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
-        result["gen_len"] = np.mean(prediction_lens)
-        result = {k: round(v, 4) for k, v in result.items()}
-        return result
-
-    # Initialize our Trainer
-    trainer = Seq2SeqTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
-        tokenizer=tokenizer,
-        data_collator=data_collator,
-        compute_metrics=compute_metrics if training_args.predict_with_generate else None,
-    )
-
-    # Training
-    if training_args.do_train:
-        if last_checkpoint is not None:
-            checkpoint = last_checkpoint
-        elif os.path.isdir(model_args.model_name_or_path):
-            checkpoint = model_args.model_name_or_path
-        else:
-            checkpoint = None
-        train_result = trainer.train(resume_from_checkpoint=checkpoint)
-        trainer.save_model()  # Saves the tokenizer too for easy upload
-
-        metrics = train_result.metrics
-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
-
-    # Evaluation
-    results = {}
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-
-        metrics = trainer.evaluate(
-            max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval"
-        )
-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
-
-    if training_args.do_predict:
-        logger.info("*** Test ***")
-
-        test_results = trainer.predict(
-            test_dataset,
-            metric_key_prefix="test",
-            max_length=data_args.val_max_target_length,
-            num_beams=data_args.num_beams,
-        )
-        metrics = test_results.metrics
-        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
-        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
-
-        trainer.log_metrics("test", metrics)
-        trainer.save_metrics("test", metrics)
-
-        if trainer.is_world_process_zero():
-            if training_args.predict_with_generate:
-                test_preds = tokenizer.batch_decode(
-                    test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
-                )
-                test_preds = [pred.strip() for pred in test_preds]
-                output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt")
-                with open(output_test_preds_file, "w") as writer:
-                    writer.write("\n".join(test_preds))
-
-    return results
-
-
-def _mp_fn(index):
-    # For xla_spawn (TPUs)
-    main()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/legacy/seq2seq/save_len_file.py
+++ b/examples/legacy/seq2seq/save_len_file.py
--- a/examples/legacy/seq2seq/save_randomly_initialized_model.py
+++ b/examples/legacy/seq2seq/save_randomly_initialized_model.py
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Lysandre	800f385d78	Release: v4.3.0 Some checks failed Model templates runner / run_tests_templates (push) Has been cancelled Details Release - Conda / build_and_package (push) Has been cancelled Details	2021-02-08 18:31:49 +01:00
Anthony MOI	bcf49c0438	Update tokenizers requirement (#10077 )	2021-02-08 18:29:16 +01:00
Patrick von Platen	15a8906c71	Bump minimum Jax requirement to 2.8.0 (#10027 ) * Bump minimum Jax requirement to 2.8.0 * update table	2021-02-08 18:18:26 +01:00