Release: v4.3.1

Deprecate Wav2Vec2ForMaskedLM and add Wav2Vec2ForCTC (#10089)
* add wav2vec2CTC and deprecate for maskedlm
* remove from docs
2021-02-09 09:55:55 +01:00 · 2021-02-09 09:55:55 +01:00 · 2021-02-08 18:31:49 +01:00 · 2021-02-08 18:29:16 +01:00 · 2021-02-08 18:18:26 +01:00
715 changed files with 16774 additions and 73059 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,6 +3,7 @@ orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0

+
 # TPU REFERENCES
 references:
    checkout_ml_testing: &checkout_ml_testing
@@ -68,8 +69,6 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
-            RUN_PT_TF_CROSS_TESTS: yes
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -78,45 +77,14 @@ jobs:
                  keys:
                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
-            - store_artifacts:
-                  path: ~/transformers/tests_output.txt
-            - store_artifacts:
-                  path: ~/transformers/reports
-
-    run_tests_torch_and_flax:
-        working_directory: ~/transformers
-        docker:
-            - image: circleci/python:3.6
-        environment:
-            OMP_NUM_THREADS: 1
-            RUN_PT_FLAX_CROSS_TESTS: yes
-            TRANSFORMERS_IS_CI: yes
-        resource_class: xlarge
-        parallelism: 1
-        steps:
-            - checkout
-            - restore_cache:
-                  keys:
-                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}
-                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
-            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
-            - save_cache:
-                key: v0.4-{{ checksum "setup.py" }}
-                paths:
-                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+            - run: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -128,7 +96,6 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -137,15 +104,14 @@ jobs:
                  keys:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 4 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -157,7 +123,6 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -184,7 +149,6 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -194,7 +158,7 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,testing,sentencepiece]
+            - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
@@ -211,8 +175,6 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
-            RUN_PIPELINE_TESTS: yes
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -221,15 +183,14 @@ jobs:
                  keys:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
+            - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -241,8 +202,6 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
-            RUN_PIPELINE_TESTS: yes
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -257,7 +216,7 @@ jobs:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
+            - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -269,7 +228,6 @@ jobs:
            - image: circleci/python:3.7
        environment:
            RUN_CUSTOM_TOKENIZERS: yes
-            TRANSFORMERS_IS_CI: yes
        steps:
            - checkout
            - restore_cache:
@@ -277,7 +235,7 @@ jobs:
                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[ja,testing,sentencepiece,jieba]
+            - run: pip install .[ja,testing,sentencepiece]
            - run: python -m unidic download
            - save_cache:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -295,7 +253,6 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -311,7 +268,7 @@ jobs:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
@@ -321,9 +278,6 @@ jobs:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.7
-        environment:
-            RUN_GIT_LFS_TESTS: yes
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -334,7 +288,7 @@ jobs:
                git config --global user.name "ci"
            - run: pip install --upgrade pip
            - run: pip install .[testing]
-            - run: python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
+            - run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"

    build_doc:
        working_directory: ~/transformers
@@ -346,14 +300,13 @@ jobs:
                  keys:
                      - v0.4-build_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install ."[docs]"
+            - run: pip install ."[all, docs]"
            - save_cache:
                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: cd docs && make html SPHINXOPTS="-W -j 4"
+            - run: cd docs && make html SPHINXOPTS="-W"
            - store_artifacts:
                path: ./docs/_build

@@ -370,7 +323,7 @@ jobs:
                  keys:
                      - v0.4-deploy_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: pip install ."[docs]"
+            - run: pip install ."[all,docs]"
            - save_cache:
                  key: v0.4-deploy_doc-{{ checksum "setup.py" }}
                  paths:
@@ -398,14 +351,12 @@ jobs:
                      - '~/.cache/pip'
            - run: black --check examples tests src utils
            - run: isort --check-only examples tests src utils
-            - run: python utils/custom_init_isort.py --check_only
            - run: flake8 examples tests src utils
            - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
            - run: python utils/check_copies.py
            - run: python utils/check_table.py
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
-            - run: python utils/check_inits.py

    check_repository_consistency:
        working_directory: ~/transformers
@@ -424,7 +375,6 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
-            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -463,7 +413,6 @@ workflows:
            - run_examples_torch
            - run_tests_custom_tokenizers
            - run_tests_torch_and_tf
-            - run_tests_torch_and_flax
            - run_tests_torch
            - run_tests_tf
            - run_tests_flax
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -3,7 +3,6 @@ cd docs
 function deploy_doc(){
 	echo "Creating doc at commit $1 and pushing to folder $2"
 	git checkout $1
-	pip install -U ..
 	if [ ! -z "$2" ]
 	then
 		if [ "$2" == "master" ]; then
@@ -46,7 +45,7 @@ deploy_doc "6f5a12a" v2.7.0
 deploy_doc "11c3257" v2.8.0
 deploy_doc "e7cfc1a" v2.9.0
 deploy_doc "7cb203f" v2.9.1
-deploy_doc "10d7239" v2.10.0
+deploy_doc "10d7239" v2.10.0 
 deploy_doc "b42586e" v2.11.0
 deploy_doc "7fb8bdf" v3.0.2
 deploy_doc "4b3ee9c" v3.1.0
@@ -54,12 +53,6 @@ deploy_doc "3ebb1b3" v3.2.0
 deploy_doc "0613f05" v3.3.1
 deploy_doc "eb0e0ce" v3.4.0
 deploy_doc "818878d" v3.5.1
-deploy_doc "c781171" v4.0.1
+deploy_doc "c781171" v4.0.0
 deploy_doc "bfa4ccf" v4.1.1
-deploy_doc "7d9a9d0" v4.2.2
-deploy_doc "bae0c79" v4.3.3
-deploy_doc "c988db5" v4.4.0
-deploy_doc "c5d6a28" v4.4.1
-deploy_doc "6bc89ed" v4.4.2
-deploy_doc "4906a29" v4.5.0
-deploy_doc "4bae96e"  # v4.5.1 Latest stable release
+deploy_doc "7d9a9d0" # v4.2.0 Latest stable release
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +0,0 @@
-*.py	eol=lf
-*.rst	eol=lf
-*.md	eol=lf
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -34,7 +34,7 @@ Models:
 - funnel: @sgugger
 - gpt2: @patrickvonplaten, @LysandreJik
 - rag: @patrickvonplaten, @lhoestq
- tensorflow: @Rocketknight1
+- tensorflow: @jplu

 Library:

@@ -42,19 +42,15 @@ Library:
 - deepspeed: @stas00
 - ray/raytune: @richardliaw, @amogkam
 - text generation: @patrickvonplaten
- tokenizers: @LysandreJik
+- tokenizers: @n1t0, @LysandreJik
 - trainer: @sgugger
 - pipelines: @LysandreJik

 Documentation: @sgugger

-Model hub:
-
- for issues with a model report at https://discuss.huggingface.co/ and tag the model's creator.
-
 HF projects:

- datasets: [different repo](https://github.com/huggingface/datasets)
+- nlp datasets: [different repo](https://github.com/huggingface/nlp)
 - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

 Examples:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -30,7 +30,7 @@ Fixes # (issue)
 ## Who can review?

 Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
-members/contributors who may be interested in your PR.
+members/contributors which may be interested in your PR.

 <!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @

@@ -46,7 +46,7 @@ Models:
 - funnel: @sgugger
 - gpt2: @patrickvonplaten, @LysandreJik
 - rag: @patrickvonplaten, @lhoestq
- tensorflow: @LysandreJik
+- tensorflow: @jplu

 Library:

@@ -62,7 +62,7 @@ Documentation: @sgugger

 HF projects:

- datasets: [different repo](https://github.com/huggingface/datasets)
+- nlp datasets: [different repo](https://github.com/huggingface/nlp)
 - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

 Examples:
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -14,7 +14,7 @@ requirements:
  host:
    - python
    - pip
-    - numpy >=1.17
+    - numpy
    - dataclasses
    - packaging
    - filelock
@@ -23,10 +23,10 @@ requirements:
    - sacremoses
    - regex !=2019.12.17
    - protobuf
-    - tokenizers >=0.10.1,<0.11.0
+    - tokenizers ==0.9.4
  run:
    - python
-    - numpy >=1.17
+    - numpy
    - dataclasses
    - packaging
    - filelock
@@ -35,7 +35,7 @@ requirements:
    - sacremoses
    - regex !=2019.12.17
    - protobuf
-    - tokenizers >=0.10.1,<0.11.0
+    - tokenizers ==0.9.4

 test:
  imports:
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -0,0 +1,18 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 60
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - pinned
+  - security
+  - Feature request
+# Label to use when marking an issue as stale
+staleLabel: wontfix
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -1,13 +1,15 @@
 name: Model templates runner

 on:
-  pull_request:
+  push:
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
-    types: [assigned, opened, synchronize, reopened]
+  pull_request_target:
+    branches:
+      - master

 jobs:
  run_tests_templates:
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -5,96 +5,148 @@ on:
    branches:
      - master
      - ci_*
-      - ci-*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
      - "templates/**"
+  # pull_request:
  repository_dispatch:

-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8

 jobs:
  run_tests_torch_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+#      - name: Create model files
+#        run: |
+#          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
      - name: Run all non-slow tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt
-
+        
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports
+                  

  run_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

+      - name: Create model files
+        run: |
+          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
      - name: Run all non-slow tests on GPU
        env:
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt
-
+        
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
@@ -102,42 +154,58 @@ jobs:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

-
  run_tests_torch_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version

+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all non-slow tests on GPU
        env:
-          MKL_SERVICE_FORCE_INTEL: 1
+          OMP_NUM_THREADS: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_torch_multi_gpu_failures_short.txt
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt          

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -147,34 +215,52 @@ jobs:
          path: reports

  run_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
+      - uses: actions/checkout@v2
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version

+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all non-slow tests on GPU
        env:
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
+          OMP_NUM_THREADS: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -186,110 +272,4 @@ jobs:
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
-
-  run_tests_torch_cuda_extensions_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[testing,deepspeed]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
-          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_tests_torch_cuda_extensions_gpu_test_reports
-          path: reports
-
-  run_tests_torch_cuda_extensions_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[testing,deepspeed,fairscale]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
-          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
-          path: reports
-
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-latest
-    if: always()
-    needs: [
-        run_tests_torch_gpu,
-        run_tests_tf_gpu,
-        run_tests_torch_multi_gpu,
-        run_tests_tf_multi_gpu,
-        run_tests_torch_cuda_extensions_gpu,
-        run_tests_torch_cuda_extensions_multi_gpu
-    ]
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/download-artifact@v2
-
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
-
-        run: |
-          pip install slack_sdk
-          python utils/notification_service.py push
+          
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,66 +1,82 @@
+# configuration notes:
+#
+# - `source .env/bin/activate` currently needs to be run as the very first command in each step. Otherwise
+# the step uses the system-wide python interpreter.
+
 name: Self-hosted runner (scheduled)

 on:
-  push:
-    branches:
-      - multi_ci_*
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"

-env:
-  HF_HOME: /mnt/cache
-  TRANSFORMERS_IS_CI: yes
-  RUN_SLOW: yes
-  OMP_NUM_THREADS: 16
-  MKL_NUM_THREADS: 16
-
 jobs:
  run_all_tests_torch_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache  # referenced as steps.cache.outputs.cache-hit by the venv-creation step
+        with:
+          path: .env  # the virtualenv directory created with `python -m venv .env`
+          key: v1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}  # same `v1.1-` prefix format as the other jobs
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt
-
+        
      - name: Run examples tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
+          OMP_NUM_THREADS: 1
          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          TRANSFORMERS_IS_CI: yes
        run: |
+          source .env/bin/activate
          pip install -r examples/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
@@ -69,9 +85,13 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -84,36 +104,60 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

-  run_all_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+  run_all_tests_tf_gpu:
+    runs-on: [self-hosted, gpu, single-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      - name: Run all tests on GPU
        env:
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
-
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+          
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt
@@ -121,15 +165,17 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
+        run: cat reports/tests_tf_pipelines_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -137,49 +183,86 @@ jobs:
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports
-
+          
  run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

-      - name: Run all tests on GPU
+      - name: Run all tests on multi-GPU
        env:
-          MKL_SERVICE_FORCE_INTEL: 1
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

-      - name: Run all pipeline tests on GPU
+      - name: Run examples tests on multi-GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -193,48 +276,73 @@ jobs:
          path: reports

  run_all_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-      - name: NVIDIA-SMI
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
        run: |
-          nvidia-smi
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

      - name: Install dependencies
        run: |
+          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
+          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

-      - name: Run all tests on GPU
+      - name: Run all tests on multi-GPU
        env:
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

-      - name: Run all pipeline tests on GPU
+      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          TF_NUM_INTEROP_THREADS: 1
-          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
-
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
@@ -245,110 +353,4 @@ jobs:
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports
-
-  run_all_tests_torch_cuda_extensions_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[testing,deepspeed]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
-          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_tests_torch_cuda_extensions_gpu_test_reports
-          path: reports
-
-  run_all_tests_torch_cuda_extensions_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[testing,deepspeed,fairscale]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
-          python -c "import torch; print('Cuda version:', torch.version.cuda)"
-          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
-          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
-
-      - name: Run all tests on GPU
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
-          path: reports
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-latest
-    if: always()
-    needs: [
-        run_all_tests_torch_gpu,
-        run_all_tests_tf_gpu,
-        run_all_tests_torch_multi_gpu,
-        run_all_tests_tf_multi_gpu,
-        run_all_tests_torch_cuda_extensions_gpu,
-        run_all_tests_torch_cuda_extensions_multi_gpu
-    ]
-    steps:
-      - uses: actions/checkout@v2
-
-      - uses: actions/download-artifact@v2
-
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
-
-
-        run: |
-          pip install slack_sdk
-          python utils/notification_service.py scheduled
+          
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -1,27 +0,0 @@
-name: Stale Bot
-
-on:
-  schedule:
-    - cron: "0 15 * * *"
-
-jobs:
-  close_stale_issues:
-    name: Close Stale Issues
-    if: github.repository == 'huggingface/transformers'
-    runs-on: ubuntu-latest
-    env:
-      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Setup Python
-      uses: actions/setup-python@v1
-      with:
-        python-version: 3.7
-
-    - name: Install requirements
-      run: |
-        pip install PyGithub
-    - name: Close stale issues
-      run: |
-        python scripts/stale.py
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,8 @@ __pycache__/
 *.so

 # tests and logs
-tests/fixtures/cached_*_text.txt
+tests/fixtures/*
+!tests/fixtures/sample_text_no_unicode.txt
 logs/
 lightning_logs/
 lang_code_data/
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -207,8 +207,6 @@ You are not required to read the following guidelines before opening an issue. H

  Do not despair if you can't figure it out from the beginning, just share what you can and perhaps someone else will be able to help you at the forums.

-   If your setup involves any custom datasets, the best way to help us reproduce the problem is to create a [Google Colab notebook](https://colab.research.google.com/) that demonstrates the issue and once you verify that the issue still exists, include a link to that notebook in the Issue. Just make sure that you don't copy and paste the location bar url of the open notebook - as this is private and we won't be able to open it. Instead, you need to click on `Share` in the right upper corner of the notebook, select `Get Link` and then copy and paste the public link it will give to you.
-
 7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.

 8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
--- a/Makefile
+++ b/Makefile
@@ -19,44 +19,33 @@ modified_only_fixup:
 deps_table_update:
 	@python setup.py deps_table_update

-# autogenerating code
-
-autogenerate_code: deps_table_update
-	python utils/class_mapping_update.py
-
 # Check that source code meets quality standards

-extra_quality_checks:
+extra_quality_checks: deps_table_update
 	python utils/check_copies.py
 	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
-	python utils/check_inits.py
+	python utils/style_doc.py src/transformers docs/source --max_len 119

 # this target runs checks on all files
 quality:
 	black --check $(check_dirs)
 	isort --check-only $(check_dirs)
-	python utils/custom_init_isort.py --check_only
 	flake8 $(check_dirs)
+	python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
 	${MAKE} extra_quality_checks

 # Format source code automatically and check if there are any problems left that need manual fixing

-extra_style_checks:
-	python utils/custom_init_isort.py
-	python utils/style_doc.py src/transformers docs/source --max_len 119
-
-# this target runs checks on all files and potentially modifies some of them
-style:
+style: deps_table_update
 	black $(check_dirs)
 	isort $(check_dirs)
-	${MAKE} autogenerate_code
-	${MAKE} extra_style_checks
+	python utils/style_doc.py src/transformers docs/source --max_len 119

 # Super fast fix and check target that only works on relevant modified files since the branch was made

-fixup: modified_only_fixup extra_style_checks autogenerate_code extra_quality_checks
+fixup: modified_only_fixup extra_quality_checks

 # Make marked copies of snippets of codes conform to the original

@@ -75,27 +64,7 @@ test:
 test-examples:
 	python -m pytest -n auto --dist=loadfile -s -v ./examples/

-# Run tests for SageMaker DLC release
-
-test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
-	TEST_SAGEMAKER=True python -m pytest -n auto  -s -v ./tests/sagemaker
-
-
 # Check that docs can build

 docs:
 	cd docs && make html SPHINXOPTS="-W -j 4"
-
-# Release stuff
-
-pre-release:
-	python utils/release.py
-
-pre-patch:
-	python utils/release.py --patch
-
-post-release:
-	python utils/release.py --post_release
-
-post-patch:
-	python utils/release.py --post_release --patch
--- a/README.md
+++ b/README.md
@@ -1,12 +1,3 @@
-# Patches
-
-This branch has the following patches:
-
-* gpt-neo model is loaded directly on GPU to save system memory
-* repetition_penalty has range and slope settings, so it doesn't penalize all tokens in the context window
-* no copy of the state dict is made while loading a pretrained model
-* local self attention uses padding so it doesn't OOM on long sequences
-
 <!---
 Copyright 2020 The HuggingFace Team. All rights reserved.

@@ -64,7 +55,7 @@ Here are a few examples:
 - [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
 - [Name Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
 - [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
 - [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
 - [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
 - [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
@@ -203,17 +194,13 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
@@ -224,36 +211,27 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
 1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

 To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable)
--- a/docs/README.md
+++ b/docs/README.md
@@ -26,7 +26,7 @@ pip install -e ".[docs]"
 ---
 **NOTE**

-You only need to generate the documentation to inspect it locally (if you're planning changes and want to
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to 
 check how they look like before committing for instance). You don't have to commit the built documentation.

 ---
@@ -65,7 +65,7 @@ make html
 ```

 A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
-browser.
+browser. 

 ---
 **NOTE**
@@ -95,15 +95,15 @@ following these steps:
  expand them).
 - Click on "details" next to the `ci/circleci: build_doc` check.
 - In the new window, click on the "Artifacts" tab.
- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a
+- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a 
  preview.

 ## Writing Documentation - Specification

 The `huggingface/transformers` documentation follows the
 [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
-mostly written in ReStructuredText
-([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
+mostly written in ReStructuredText 
+([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 
 [Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).


@@ -121,8 +121,8 @@ four.
 ### Adding a new model

 When adding a new model:
-
- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template).
+ 
+- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template). 
 - Link that file in `./source/index.rst` on the `model_doc` toc-tree.
 - Write a short overview of the model:
    - Overview with paper & authors
@@ -130,8 +130,8 @@ When adding a new model:
    - Tips and tricks and how to use it best
 - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
  every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
-  The order is generally:
-    - Configuration,
+  The order is generally: 
+    - Configuration, 
    - Tokenizer
    - PyTorch base model
    - PyTorch head models
@@ -179,7 +179,7 @@ Links should be done as so (note the double underscore at the end): \`text for t

 #### Defining arguments in a method

-Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
+Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 
 The argument should be followed by its type, with its shape if it is a tensor, and a line return.
 Another indentation is necessary before writing the description of the argument.

@@ -216,9 +216,9 @@ then its documentation should look like this:

 Note that we always omit the "defaults to :obj:\`None\`" when None is the default for any argument. Also note that even
 if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
-however write as many lines as you want in the indented description (see the example above with `input_ids`).
+however write as many lines as you want in the indented description (see the example above with `input_ids`). 

-#### Writing a multi-line code block
+#### Writing a multi-line code block 

 Multi-line code blocks can be useful for displaying examples. They are done like so:

@@ -237,7 +237,7 @@ the results stay consistent with the library.

 #### Writing a return block

-Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
+Return values should be defined with the `Returns:` prefix, followed by a line return and an indentation.
 The first line should be the type of the return, followed by a line return. No need to indent further for the elements
 building the return.

@@ -258,43 +258,3 @@ Here's an example for a single value return:
    Returns:
        :obj:`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
 ```
-
-#### Adding a new section
-
-In ReST section headers are designated as such with the help of a line of underlying characters, e.g.,:
-
-```
-Section 1
-^^^^^^^^^^^^^^^^^^
-
-Sub-section 1
-~~~~~~~~~~~~~~~~~~
-```
-
-ReST allows the use of any characters to designate different section levels, as long as they are used consistently within the same document. For details see [sections doc](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#sections). Because there is no standard different documents often end up using different characters for the same levels which makes it very difficult to know which character to use when creating a new section.
-
-Specifically, if when running `make docs` you get an error like:
-```
-docs/source/main_classes/trainer.rst:127:Title level inconsistent:
-```
-you picked an inconsistent character for some of the levels.
-
-But how do you know which characters you must use for an already existing level or when adding a new level?
-
-You can use this helper script:
-```
-perl -ne '/^(.)\1{100,}/ && do { $h{$1}=++$c if !$h{$1} }; END { %h = reverse %h ; print "$_ $h{$_}\n" for sort keys %h}' docs/source/main_classes/trainer.rst
-1 -
-2 ~
-3 ^
-4 =
-5 "
-```
-
-This tells you which characters have already been assigned for each level.
-
-So using this particular example's output -- if your current section's header uses `=` as its underline character, you now know you're at level 4, and if you want to add a sub-section header you know you want `"` as it'd level 5.
-
-If you needed to add yet another sub-level, then pick a character that is not used already. That is you must pick a character that is not in the output of that script.
-
-Here is the full list of characters that can be used in this context: `= - ` : ' " ~ ^ _ * + # < >`
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,13 +1,10 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.5.1"
+const stableVersion = "v4.2.0"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.5.0/v4.5.1 (stable)",
-    "v4.4.2": "v4.4.0/v4.4.1/v4.4.2",
-    "v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3",
-    "v4.2.2": "v4.2.0/v4.2.1/v4.2.2",
+    "": "v4.2.0/v4.2.1 (stable)",
    "v4.1.1": "v4.1.0/v4.1.1",
    "v4.0.1": "v4.0.0/v4.0.1",
    "v3.5.1": "v3.5.0/v3.5.1",
@@ -63,7 +60,7 @@ function addIcon() {
 function addCustomFooter() {
    const customFooter = document.createElement("div");
    const questionOrIssue = document.createElement("div");
-    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://huggingface.co/blog'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
+    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
    customFooter.appendChild(questionOrIssue);
    customFooter.classList.add("footer");

@@ -130,11 +127,11 @@ function addVersionControl() {
    const parts = location.toString().split('/');
    let versionIndex = parts.length - 2;
    // Index page may not have a last part with filename.html so we need to go up
-    if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html/)) {
+    if (parts[parts.length - 1] != "" && ! parts[parts.length - 1].match(/\.html$|^search.html?/)) {
        versionIndex = parts.length - 1;
    }
    // Main classes and models are nested so we need to go deeper
-    else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc" || parts[versionIndex] == "internal") {
+    else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc") {
        versionIndex = versionIndex - 1;
    } 
    const version = parts[versionIndex];
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.rst
@@ -388,7 +388,7 @@ Next, you can finally start adding new code to 🤗 Transformers. Go into the cl

 ::

-    cd transformers
+   cd transformers

 In the special case that you are adding a model whose architecture exactly matches the model architecture of an
 existing model you only have to add a conversion script as described in `this section <#write-a-conversion-script>`__.
@@ -417,27 +417,27 @@ You should do the following:

 ::

-    git checkout -b add_brand_new_bert
+   git checkout -b add_brand_new_bert

 2. Commit the automatically generated code:

 ::

-    git add .
-    git commit
+   git add .
+   git commit

 3. Fetch and rebase to current master

 ::

-    git fetch upstream
-    git rebase upstream/master
+   git fetch upstream
+   git rebase upstream/master

 4. Push the changes to your account using:

 ::

-    git push -u origin a-descriptive-name-for-my-changes
+   git push -u origin a-descriptive-name-for-my-changes

 5. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the
   GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for
@@ -451,8 +451,8 @@ time to time by doing:

 ::

-    git fetch upstream
-    git merge upstream/master
+   git fetch upstream
+   git merge upstream/master

 In general, all questions you might have regarding the model or your implementation should be asked in your PR and
 discussed/solved in the PR. This way, the Hugging Face team will always be notified when you are committing new code or
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -18,8 +18,8 @@ This page regroups resources around 🤗 Transformers developed by the community
 | [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb)  | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots |  [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
 | [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb)  | How to train on sequences as long as 500,000 tokens with Reformer |  [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb)  |
 | [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/_notebooks/2020-05-23-text-generation-with-blurr.ipynb) |
-| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model |  [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
-| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) |
+| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb)  | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model |  [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
+| [A Step by Step Guide to Tracking Hugging Face Model Performance](https://colab.research.google.com/drive/1NEiqNPhiouu2pPwDAVeFoN4-vTYMz9F8)  | A quick tutorial for training NLP models with HuggingFace and visualizing their performance with Weights & Biases |  [Jack Morris](https://github.com/jxmorris12) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1NEiqNPhiouu2pPwDAVeFoN4-vTYMz9F8) |
 | [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb)  | How to build a "long" version of existing pretrained models |  [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) |
 | [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) |
 | [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) |
@@ -30,7 +30,6 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
 |[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
 |[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
-|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)|
 |[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)|
 |[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
 |[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
@@ -48,7 +47,3 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)|
 |[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)|
 |[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)|
-|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
-|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
-|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|
-| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) |
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -14,24 +14,21 @@
 #
 import os
 import sys
-
-sys.path.insert(0, os.path.abspath("../../src"))
+sys.path.insert(0, os.path.abspath('../../src'))


 # -- Project information -----------------------------------------------------

-project = "transformers"
-copyright = "2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0"
-author = "huggingface"
+project = u'transformers'
+copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0'
+author = u'huggingface'

 # The short X.Y version
-version = ""
+version = u''
 # The full version, including alpha/beta/rc tags
-release = "4.5.0.dev0"
-
-
+release = u'4.3.0'
 # Prefix link to point to master, comment this during version release and uncomment below line
-extlinks = {"prefix_link": ("https://github.com/huggingface/transformers/blob/master/%s", "")}
+extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
 # Prefix link to always point to corresponding version, uncomment this during version release
 # extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')}

@@ -45,28 +42,27 @@ extlinks = {"prefix_link": ("https://github.com/huggingface/transformers/blob/ma
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    "sphinx.ext.autodoc",
-    "sphinx.ext.extlinks",
-    "sphinx.ext.coverage",
-    "sphinx.ext.napoleon",
-    "recommonmark",
-    "sphinx.ext.viewcode",
-    "sphinx_markdown_tables",
-    "sphinxext.opengraph",
-    "sphinx_copybutton",
+    'sphinx.ext.autodoc',
+    'sphinx.ext.extlinks',
+    'sphinx.ext.coverage',
+    'sphinx.ext.napoleon',
+    'recommonmark',
+    'sphinx.ext.viewcode',
+    'sphinx_markdown_tables',
+    'sphinx_copybutton'
 ]

 # Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
+templates_path = ['_templates']

 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
-source_suffix = [".rst", ".md"]
+source_suffix = ['.rst', '.md']
 # source_suffix = '.rst'

 # The master toctree document.
-master_doc = "index"
+master_doc = 'index'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -78,7 +74,7 @@ language = None
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']

 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = None
@@ -92,30 +88,20 @@ copybutton_prompt_is_regexp = True
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = "sphinx_rtd_theme"
+html_theme = 'sphinx_rtd_theme'

 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-html_theme_options = {"analytics_id": "UA-83738774-2", "navigation_with_keys": True}
-
-#  Configuration for OpenGraph and Twitter Card Tags.
-# These are responsible for creating nice shareable social images https://ahrefs.com/blog/open-graph-meta-tags/
-# https://ogp.me/#type_website
-ogp_image = "https://huggingface.co/front/thumbnails/transformers.png"
-ogp_description = "State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0. Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone"
-ogp_description_length = 160
-
-ogp_custom_meta_tags = [
-    f'<meta name="twitter:image" content="{ogp_image}">',
-    f'<meta name="twitter:description" content="{ogp_description}">',
-]
+html_theme_options = {
+    'analytics_id': 'UA-83738774-2'
+}

 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
+html_static_path = ['_static']

 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -127,17 +113,17 @@ html_static_path = ["_static"]
 #
 # html_sidebars = {}

-# This must be the name of an image file (path relative to the configuration
-# directory) that is the favicon of the docs. Modern browsers use this as
-# the icon for tabs, windows and bookmarks. It should be a Windows-style
+# This must be the name of an image file (path relative to the configuration 
+# directory) that is the favicon of the docs. Modern browsers use this as 
+# the icon for tabs, windows and bookmarks. It should be a Windows-style 
 # icon file (.ico).
-html_favicon = "favicon.ico"
+html_favicon = 'favicon.ico'


 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
-htmlhelp_basename = "transformersdoc"
+htmlhelp_basename = 'transformersdoc'


 # -- Options for LaTeX output ------------------------------------------------
@@ -146,12 +132,15 @@ latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
+
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
+
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
+
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
@@ -161,7 +150,8 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, "transformers.tex", "transformers Documentation", "huggingface", "manual"),
+    (master_doc, 'transformers.tex', u'transformers Documentation',
+     u'huggingface', 'manual'),
 ]


@@ -169,7 +159,10 @@ latex_documents = [

 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "transformers", "transformers Documentation", [author], 1)]
+man_pages = [
+    (master_doc, 'transformers', u'transformers Documentation',
+     [author], 1)
+]


 # -- Options for Texinfo output ----------------------------------------------
@@ -178,15 +171,9 @@ man_pages = [(master_doc, "transformers", "transformers Documentation", [author]
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (
-        master_doc,
-        "transformers",
-        "transformers Documentation",
-        author,
-        "transformers",
-        "One line description of project.",
-        "Miscellaneous",
-    ),
+    (master_doc, 'transformers', u'transformers Documentation',
+     author, 'transformers', 'One line description of project.',
+     'Miscellaneous'),
 ]


@@ -205,13 +192,11 @@ epub_title = project
 # epub_uid = ''

 # A list of files that should not be packed into the epub file.
-epub_exclude_files = ["search.html"]
-
+epub_exclude_files = ['search.html']

 def setup(app):
-    app.add_css_file("css/huggingface.css")
-    app.add_css_file("css/code-snippets.css")
-    app.add_js_file("js/custom.js")
-
+    app.add_css_file('css/huggingface.css')
+    app.add_css_file('css/code-snippets.css')
+    app.add_js_file('js/custom.js')

 # -- Extension configuration -------------------------------------------------
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -47,12 +47,12 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas

 .. code-block:: shell

-    export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-    transformers-cli convert --model_type bert \
-      --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
-      --config $BERT_BASE_DIR/bert_config.json \
-      --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
+   transformers-cli convert --model_type bert \
+     --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
+     --config $BERT_BASE_DIR/bert_config.json \
+     --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here
 <https://github.com/google-research/bert#pre-trained-models>`__.
@@ -72,12 +72,12 @@ Here is an example of the conversion process for the pre-trained ``ALBERT Base``

 .. code-block:: shell

-    export ALBERT_BASE_DIR=/path/to/albert/albert_base
+   export ALBERT_BASE_DIR=/path/to/albert/albert_base

-    transformers-cli convert --model_type albert \
-      --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
-      --config $ALBERT_BASE_DIR/albert_config.json \
-      --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
+   transformers-cli convert --model_type albert \
+     --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
+     --config $ALBERT_BASE_DIR/albert_config.json \
+     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin

 You can download Google's pre-trained models for the conversion `here
 <https://github.com/google-research/albert#pre-trained-models>`__.
@@ -91,13 +91,13 @@ save as the same format than OpenAI pretrained model (see `here <https://github.

 .. code-block:: shell

-    export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-    transformers-cli convert --model_type gpt \
-      --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--config OPENAI_GPT_CONFIG] \
-      [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
+   transformers-cli convert --model_type gpt \
+     --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config OPENAI_GPT_CONFIG] \
+     [--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]


 OpenAI GPT-2
@@ -108,13 +108,13 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode

 .. code-block:: shell

-    export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights

-    transformers-cli convert --model_type gpt2 \
-      --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--config OPENAI_GPT2_CONFIG] \
-      [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
+   transformers-cli convert --model_type gpt2 \
+     --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config OPENAI_GPT2_CONFIG] \
+     [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]

 Transformer-XL
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -124,13 +124,13 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo

 .. code-block:: shell

-    export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint

-    transformers-cli convert --model_type transfo_xl \
-      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--config TRANSFO_XL_CONFIG] \
-      [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
+   transformers-cli convert --model_type transfo_xl \
+     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config TRANSFO_XL_CONFIG] \
+     [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]


 XLNet
@@ -140,14 +140,14 @@ Here is an example of the conversion process for a pre-trained XLNet model:

 .. code-block:: shell

-    export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
-    export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
+   export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+   export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-    transformers-cli convert --model_type xlnet \
-      --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
-      --config $TRANSFO_XL_CONFIG_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-      [--finetuning_task_name XLNET_FINETUNED_TASK] \
+   transformers-cli convert --model_type xlnet \
+     --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
+     --config $TRANSFO_XL_CONFIG_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--finetuning_task_name XLNET_FINETUNED_TASK]


 XLM
@@ -157,13 +157,13 @@ Here is an example of the conversion process for a pre-trained XLM model:

 .. code-block:: shell

-    export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-    transformers-cli convert --model_type xlm \
-      --tf_checkpoint $XLM_CHECKPOINT_PATH \
-      --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
-     [--config XML_CONFIG] \
-     [--finetuning_task_name XML_FINETUNED_TASK]
+   transformers-cli convert --model_type xlm \
+     --tf_checkpoint $XLM_CHECKPOINT_PATH \
+     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
+     [--config XLM_CONFIG] \
+     [--finetuning_task_name XLM_FINETUNED_TASK]


 T5
@@ -173,9 +173,9 @@ Here is an example of the conversion process for a pre-trained T5 model:

 .. code-block:: shell

-    export T5=/path/to/t5/uncased_L-12_H-768_A-12
+   export T5=/path/to/t5/uncased_L-12_H-768_A-12

-    transformers-cli convert --model_type t5 \
-      --tf_checkpoint $T5/t5_model.ckpt \
-      --config $T5/t5_config.json \
-      --pytorch_dump_output $T5/pytorch_model.bin
+   transformers-cli convert --model_type t5 \
+     --tf_checkpoint $T5/t5_model.ckpt \
+     --config $T5/t5_config.json \
+     --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -15,10 +15,10 @@ Fine-tuning with custom datasets

 .. note::

-    The datasets used in this tutorial are available and can be more easily accessed using the `🤗 Datasets library
-    <https://github.com/huggingface/datasets>`_. We do not use this library to access the datasets here since this
-    tutorial meant to illustrate how to work with your own data. A brief of introduction can be found at the end of the
-    tutorial in the section ":ref:`datasetslib`".
+    The datasets used in this tutorial are available and can be more easily accessed using the `🤗 NLP library
+    <https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here since this tutorial
+    is meant to illustrate how to work with your own data. A brief introduction can be found at the end of the tutorial
+    in the section ":ref:`nlplib`".

 This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
 shows one of many valid workflows for using these models and is meant to be illustrative rather than definitive. We
@@ -41,7 +41,7 @@ Sequence Classification with IMDb Reviews
 .. note::

    This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and
-    can be alternatively downloaded with the 🤗 Datasets library with ``load_dataset("imdb")``.
+    can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.

 In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes
 the text of a review and requires the model to predict whether the sentiment of the review is positive or negative.
@@ -260,7 +260,7 @@ Token Classification with W-NUT Emerging Entities
 .. note::

    This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_),
-    and can be alternatively downloaded with the 🤗 Datasets library with ``load_dataset("wnut_17")``.
+    and can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.

 Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
 token. We'll demonstrate how to do this with `Named Entity Recognition
@@ -459,7 +459,7 @@ Question Answering with SQuAD 2.0
 .. note::

    This dataset can be explored in the Hugging Face model hub (`SQuAD V2
-    <https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 Datasets library with
+    <https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 NLP library with
    ``load_dataset("squad_v2")``.

 Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
@@ -558,14 +558,15 @@ we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method
        end_positions = []
        for i in range(len(answers)):
            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
-            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
+            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

            # if start position is None, the answer passage has been truncated
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
-            if end_positions[-1] is None:
-                end_positions[-1] = tokenizer.model_max_length

+            # if end position is None, the 'char_to_token' function points to the space before the correct token -> add + 1
+            if end_positions[-1] is None:
+                end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    add_token_positions(train_encodings, train_answers)
@@ -677,23 +678,22 @@ Additional Resources
  - :doc:`Preprocessing <preprocessing>`. Docs page on data preprocessing.
  - :doc:`Training <training>`. Docs page on training and fine-tuning.

-.. _datasetslib:
+.. _nlplib:

-Using the 🤗 Datasets & Metrics library
+Using the 🤗 NLP Datasets & Metrics library
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with 🤗
 Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the `🤗
-Datasets library <https://github.com/huggingface/datasets>`_ for working with the 150+ datasets included in the `hub
+NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the `hub
 <https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview, we
-will show how to use the Datasets library to download and prepare the IMDb dataset from the first example,
-:ref:`seq_imdb`.
+will show how to use the NLP library to download and prepare the IMDb dataset from the first example, :ref:`seq_imdb`.

 Start by downloading the dataset:

 .. code-block:: python

-    from datasets import load_dataset
+    from nlp import load_dataset
    train = load_dataset("imdb", split="train")

 Each dataset has multiple columns corresponding to different features. Let's see what our columns are.
@@ -725,5 +725,5 @@ dataset elements.
    >>> {key: val.shape for key, val in train[0].items()})
    {'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}

-We now have a fully-prepared dataset. Check out `the 🤗 Datasets docs
-<https://huggingface.co/docs/datasets/processing.html>`_ for a more thorough introduction.
+We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for a
+more thorough introduction.
--- a/docs/source/fast_tokenizers.rst
+++ b/docs/source/fast_tokenizers.rst
@@ -1,62 +0,0 @@
-Using tokenizers from 🤗 Tokenizers
-=======================================================================================================================
-
-The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
-<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
-loaded very simply into 🤗 Transformers.
-
-Before getting in the specifics, let's first start by creating a dummy tokenizer in a few lines:
-
-.. code-block::
-
-    >>> from tokenizers import Tokenizer
-    >>> from tokenizers.models import BPE
-    >>> from tokenizers.trainers import BpeTrainer
-    >>> from tokenizers.pre_tokenizers import Whitespace
-
-    >>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
-    >>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
-
-    >>> tokenizer.pre_tokenizer = Whitespace()
-    >>> files = [...]
-    >>> tokenizer.train(files, trainer)
-
-We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
-a JSON file for future re-use.
-
-Loading directly from the tokenizer object
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
-:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation, by accepting the instantiated
-`tokenizer` object as an argument:
-
-.. code-block::
-
-    >>> from transformers import PreTrainedTokenizerFast
-
-    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
-
-This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
-page <main_classes/tokenizer>` for more information.
-
-Loading from a JSON file
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
-
-.. code-block::
-
-    >>> tokenizer.save("tokenizer.json")
-
-The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization
-method using the :obj:`tokenizer_file` parameter:
-
-.. code-block::
-
-    >>> from transformers import PreTrainedTokenizerFast
-
-    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
-
-This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
-page <main_classes/tokenizer>` for more information.
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -21,7 +21,6 @@ General terms
 - CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
  tokens at a certain timestep.
- deep learning: machine learning algorithms which uses neural networks with several layers.
 - MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
  by masking some tokens randomly, and has to predict the original text.
 - multimodal: a task that combines texts with another kind of inputs (for instance images).
@@ -34,12 +33,10 @@ General terms
  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
  masking some words and trying to predict them (see MLM).
 - RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- self-attention: each element of the input finds out which other elements of the input they should attend to.
 - seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
  summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
 - token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
  or a punctuation symbol.
- transformer: self-attention based deep learning model architecture.

 Model inputs
 -----------------------------------------------------------------------------------------------------------------------
@@ -182,7 +179,7 @@ such:

 .. code-block::

-    >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

 We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
 arguments (and not a list, like before) like this:
--- a/docs/source/imgs/transformers_overview.png
+++ b/docs/source/imgs/transformers_overview.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -22,7 +22,7 @@ State-of-the-art NLP for everyone:
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators

-..
+.. 
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -97,155 +97,114 @@ and conversion utilities for the following models:
 5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
-   for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
-   Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-7. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-8. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
+7. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-9. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+8. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
   <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
-10. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
-    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
-    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+9. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+   French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
+   Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+10. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-12. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
-    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
-    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
-    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
-    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-13. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+11. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-14. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
-    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
-    Chen.
-15. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
-    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
+12. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
+    BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-16. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
-    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
-    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-17. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+13. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-18. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+14. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-19. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+15. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-20. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+16. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-21. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+17. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-22. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+18. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-23. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+19. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-24. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+20. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-25. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
-    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-26. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
-    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
-27. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+21. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-28. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+22. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-29. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+23. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-30. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+24. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-31. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
-    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
-    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
-    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-32. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+25. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-33. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+26. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-34. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
-    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
-    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-35. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
-    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
-    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-36. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
-    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
-    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-37. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+27. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-38. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+28. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-39. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+29. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
     Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-40. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+30. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-41. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+31. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-42. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+32. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper `A Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-43. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
-    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
-    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-44. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+33. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-45. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+34. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-46. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+35. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-47. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+36. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-48. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
-    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
-    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
-    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-49. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+37. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-50. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+38. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-51. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+39. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-52. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+40. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-53. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+41. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-54. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
-    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
-    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.


 .. _bigtable:
@@ -270,8 +229,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           BigBird           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -286,10 +243,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           DeBERTa           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            DeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -302,26 +255,18 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           M2M100            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -332,7 +277,7 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             RAG             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|             RAG             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -340,8 +285,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         Speech2Text         |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
@@ -350,8 +293,6 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -394,16 +335,13 @@ TensorFlow and/or Flax.

    pretrained_models
    examples
-    troubleshooting
    custom_datasets
    notebooks
-    sagemaker
    community
    converting_tensorflow_models
    migration
    contributing
    add_new_model
-    fast_tokenizers
    testing
    serialization

@@ -421,7 +359,6 @@ TensorFlow and/or Flax.

    main_classes/callback
    main_classes/configuration
-    main_classes/data_collator
    main_classes/logging
    main_classes/model
    main_classes/optimizer_schedules
@@ -430,7 +367,6 @@ TensorFlow and/or Flax.
    main_classes/processors
    main_classes/tokenizer
    main_classes/trainer
-    main_classes/feature_extractor

 .. toctree::
    :maxdepth: 2
@@ -443,18 +379,13 @@ TensorFlow and/or Flax.
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
-    model_doc/bert_japanese
-    model_doc/bigbird
    model_doc/blenderbot
    model_doc/blenderbot_small
    model_doc/bort
    model_doc/camembert
    model_doc/convbert
-    model_doc/cpm
    model_doc/ctrl
    model_doc/deberta
-    model_doc/deberta_v2
-    model_doc/deit
    model_doc/dialogpt
    model_doc/distilbert
    model_doc/dpr
@@ -464,22 +395,17 @@ TensorFlow and/or Flax.
    model_doc/fsmt
    model_doc/funnel
    model_doc/herbert
-    model_doc/ibert
    model_doc/layoutlm
    model_doc/led
    model_doc/longformer
    model_doc/lxmert
    model_doc/marian
-    model_doc/m2m_100
    model_doc/mbart
-    model_doc/megatron_bert
-    model_doc/megatron_gpt2
    model_doc/mobilebert
    model_doc/mpnet
    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
-    model_doc/gpt_neo
    model_doc/pegasus
    model_doc/phobert
    model_doc/prophetnet
@@ -487,18 +413,15 @@ TensorFlow and/or Flax.
    model_doc/reformer
    model_doc/retribert
    model_doc/roberta
-    model_doc/speech_to_text
    model_doc/squeezebert
    model_doc/t5
    model_doc/tapas
    model_doc/transformerxl
-    model_doc/vit
    model_doc/wav2vec2
    model_doc/xlm
    model_doc/xlmprophetnet
    model_doc/xlmroberta
    model_doc/xlnet
-    model_doc/xlsr_wav2vec2

 .. toctree::
    :maxdepth: 2
@@ -509,4 +432,3 @@ TensorFlow and/or Flax.
    internal/tokenization_utils
    internal/trainer_utils
    internal/generation_utils
-    internal/file_utils
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -151,34 +151,9 @@ environment variable for ``TRANSFORMERS_CACHE``.

 ### Note on model downloads (Continuous Integration or large-scale deployments)

-If you expect to be downloading large volumes of models (more than 10,000) from huggingface.co (for instance through
+If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through
 your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
-faster, and cheaper. Feel free to contact us privately, we'd love to help with this.
-
-### Offline mode
-
-It's possible to run 🤗 Transformers in a firewalled or a no-network environment.
-
-Setting the environment variable `TRANSFORMERS_OFFLINE=1` tells 🤗 Transformers to use local files only and not to try to look things up.
-
-Most likely you may want to couple this with `HF_DATASETS_OFFLINE=1` that performs the same for 🤗 Datasets if you're using the latter.
-
-Here is an example of how this can be used on a filesystem that is shared between a normally networked instance and an instance that is firewalled from the external world.
-
-On the instance with the normal network run your program which will download and cache models (and optionally datasets if you use 🤗 Datasets). For example:
-
-```
-python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
-```
-
-and then with the same filesystem you can now run the same program on a firewalled instance:
-```
-HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
-python examples/seq2seq/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
-```
-and it should succeed without any hanging waiting to timeout.
-
-
+faster, and cheaper. Feel free to contact us privately if you need any help.

 ## Do you want to run a Transformer model on a mobile device?

--- a/docs/source/internal/file_utils.rst
+++ b/docs/source/internal/file_utils.rst
@@ -1,54 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-General Utilities
-----------------------------------------------------------------------------------------------------------------------
-
-This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.
-
-Most of those are only useful if you are studying the general code in the library.
-
-
-Enums and namedtuples
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.ExplicitEnum
-
-.. autoclass:: transformers.file_utils.PaddingStrategy
-
-.. autoclass:: transformers.file_utils.TensorType
-
-
-Special Decorators
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.file_utils.add_start_docstrings
-
-.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
-
-.. autofunction:: transformers.file_utils.add_end_docstrings
-
-.. autofunction:: transformers.file_utils.add_code_sample_docstrings
-
-.. autofunction:: transformers.file_utils.replace_return_docstrings
-
-
-Special Properties
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils.cached_property
-
-
-Other Utilities
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.file_utils._BaseLazyModule
--- a/docs/source/internal/generation_utils.rst
+++ b/docs/source/internal/generation_utils.rst
@@ -151,33 +151,6 @@ generation.
 .. autoclass:: transformers.HammingDiversityLogitsProcessor
    :members: __call__

-.. autoclass:: transformers.ForcedBOSTokenLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.ForcedEOSTokenLogitsProcessor
-    :members: __call__
-
-.. autoclass:: transformers.InfNanRemoveLogitsProcessor
-    :members: __call__
-
-
-StoppingCriteria
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-A :class:`~transformers.StoppingCriteria` can be used to change when to stop generation (other than EOS token).
-
-.. autoclass:: transformers.StoppingCriteria
-    :members: __call__
-
-.. autoclass:: transformers.StoppingCriteriaList
-    :members: __call__
-
-.. autoclass:: transformers.MaxLengthCriteria
-    :members: __call__
-
-.. autoclass:: transformers.MaxTimeCriteria
-    :members: __call__
-
 BeamSearch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@@ -47,4 +47,6 @@ Data format
 Utilities
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. autofunction:: transformers.pipelines.get_framework
+
 .. autoclass:: transformers.pipelines.PipelineException
--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@@ -38,6 +38,12 @@ SpecialTokensMixin
 Enums and namedtuples
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
+
+.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.TensorType
+
 .. autoclass:: transformers.tokenization_utils_base.TruncationStrategy

 .. autoclass:: transformers.tokenization_utils_base.CharSpan
--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@@ -22,7 +22,7 @@ Utilities

 .. autoclass:: transformers.EvalPrediction

-.. autoclass:: transformers.IntervalStrategy
+.. autoclass:: transformers.EvaluationStrategy

 .. autofunction:: transformers.set_seed

--- a/docs/source/main_classes/callback.rst
+++ b/docs/source/main_classes/callback.rst
@@ -74,32 +74,6 @@ TrainerCallback
 .. autoclass:: transformers.TrainerCallback
    :members:

-Here is an example of how to register a custom callback with the PyTorch :class:`~transformers.Trainer`:
-
-.. code-block:: python
-
-    class MyCallback(TrainerCallback):
-        "A callback that prints a message at the beginning of training"
-
-        def on_train_begin(self, args, state, control, **kwargs):
-            print("Starting training")
-
-    trainer = Trainer(
-        model,
-        args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        callbacks=[MyCallback]  # We can either pass the callback class this way or an instance of it (MyCallback())
-    )
-
-Another way to register a callback is to call ``trainer.add_callback()`` as follows:
-
-.. code-block:: python
-
-    trainer = Trainer(...)
-    trainer.add_callback(MyCallback)
-    # Alternatively, we can pass an instance of the callback class
-    trainer.add_callback(MyCallback())

 TrainerState
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/main_classes/data_collator.rst
+++ b/docs/source/main_classes/data_collator.rst
@@ -1,71 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Data Collator
-----------------------------------------------------------------------------------------------------------------------
-
-Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of
-the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.
-
-To be able to build batches, data collators may apply some processing (like padding). Some of them (like
-:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
-on the formed batch.
-
-Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.
-
-
-Default data collator
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autofunction:: transformers.data.data_collator.default_data_collator
-
-
-DataCollatorWithPadding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorWithPadding
-    :members:
-
-
-DataCollatorForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification
-    :members:
-
-
-DataCollatorForSeq2Seq
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq
-    :members:
-
-
-DataCollatorForLanguageModeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
-    :members: mask_tokens
-
-
-DataCollatorForWholeWordMask
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
-    :members: mask_tokens
-
-
-DataCollatorForPermutationLanguageModeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
-    :members: mask_tokens
--- a/docs/source/main_classes/feature_extractor.rst
+++ b/docs/source/main_classes/feature_extractor.rst
@@ -1,48 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-
-Feature Extractor
-----------------------------------------------------------------------------------------------------------------------
-
-A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
-from sequences, *e.g.*, pre-processing audio files to Log-Mel Spectrogram features, feature extraction from images
-*e.g.* cropping image files, but also padding, normalization, and conversion to Numpy, PyTorch, and TensorFlow
-tensors.
-
-
-FeatureExtractionMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.feature_extraction_utils.FeatureExtractionMixin
-    :members: from_pretrained, save_pretrained
-
-
-SequenceFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SequenceFeatureExtractor
-    :members: pad
-
-
-BatchFeature
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BatchFeature
-    :members:
-
-
-ImageFeatureExtractionMixin
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.image_utils.ImageFeatureExtractionMixin
-    :members:
--- a/docs/source/main_classes/logging.rst
+++ b/docs/source/main_classes/logging.rst
@@ -65,10 +65,6 @@ Other functions

 .. autofunction:: transformers.logging.get_logger

-.. autofunction:: transformers.logging.enable_default_handler
-
-.. autofunction:: transformers.logging.disable_default_handler
-
 .. autofunction:: transformers.logging.enable_explicit_format

 .. autofunction:: transformers.logging.reset_format
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -60,7 +60,7 @@ ModelOutput
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.file_utils.ModelOutput
-    :members: to_tuple
+    :members:


 BaseModelOutput
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -54,24 +54,19 @@ PreTrainedTokenizer

 .. autoclass:: transformers.PreTrainedTokenizer
    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add, prepare_for_tokenization, tokenize,
-        vocab_size
+    :members:
+
+    .. automethod:: encode


 PreTrainedTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
-<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 tokenizers library can be
-loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokenizers from 🤗 tokenizers
-<../fast_tokenizers>` page to understand how this is done.
-
 .. autoclass:: transformers.PreTrainedTokenizerFast
    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add,
-        set_truncation_and_padding,tokenize, vocab_size
+    :members:
+
+    .. automethod:: encode


 BatchEncoding
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -44,13 +44,6 @@ AutoTokenizer
    :members:


-AutoFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.AutoFeatureExtractor
-    :members:
-
-
 AutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -196,52 +189,3 @@ FlaxAutoModel

 .. autoclass:: transformers.FlaxAutoModel
    :members:
-
-
-FlaxAutoModelForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForPreTraining
-    :members:
-
-
-FlaxAutoModelForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForMaskedLM
-    :members:
-
-
-FlaxAutoModelForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForSequenceClassification
-    :members:
-
-
-FlaxAutoModelForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForQuestionAnswering
-    :members:
-
-
-FlaxAutoModelForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForTokenClassification
-    :members:
-
-
-FlaxAutoModelForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForMultipleChoice
-    :members:
-
-
-FlaxAutoModelForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxAutoModelForNextSentencePrediction
-    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -90,7 +90,7 @@ BertForPreTraining
    :members: forward


-BertLMHeadModel
+BertLMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BertLMHeadModel
@@ -209,50 +209,8 @@ FlaxBertModel
    :members: __call__


-FlaxBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxBertForPreTraining
-    :members: __call__
-
-
 FlaxBertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.FlaxBertForMaskedLM
    :members: __call__
-
-
-FlaxBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxBertForNextSentencePrediction
-    :members: __call__
-
-
-FlaxBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxBertForSequenceClassification
-    :members: __call__
-
-
-FlaxBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxBertForMultipleChoice
-    :members: __call__
-
-
-FlaxBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxBertForTokenClassification
-    :members: __call__
-
-
-FlaxBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.FlaxBertForQuestionAnswering
-    :members: __call__
--- a/docs/source/model_doc/bert_japanese.rst
+++ b/docs/source/model_doc/bert_japanese.rst
@@ -1,78 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-BertJapanese
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The BERT models trained on Japanese text.
-
-There are models with two different tokenization methods:
-
- Tokenize with MeCab and WordPiece. This requires some extra dependencies, `fugashi
-  <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.
- Tokenize into characters.
-
-To use `MecabTokenizer`, you should ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install
-from source) to install dependencies.
-
-See `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__.
-
-Example of using a model with MeCab and WordPiece tokenization:
-
-.. code-block::
-
-    >>> import torch
-    >>> from transformers import AutoModel, AutoTokenizer 
-
-    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
-    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
-
-    >>> ## Input Japanese Text
-    >>> line = "吾輩は猫である。"
-
-    >>> inputs = tokenizer(line, return_tensors="pt")
-
-    >>> print(tokenizer.decode(inputs['input_ids'][0]))
-    [CLS] 吾輩 は 猫 で ある 。 [SEP]
-
-    >>> outputs = bertjapanese(**inputs)
-
-Example of using a model with Character tokenization:
-
-.. code-block::
-
-    >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
-    >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
-
-    >>> ## Input Japanese Text
-    >>> line = "吾輩は猫である。"
-
-    >>> inputs = tokenizer(line, return_tensors="pt")
-
-    >>> print(tokenizer.decode(inputs['input_ids'][0]))
-    [CLS] 吾 輩 は 猫 で あ る 。 [SEP]
-
-    >>> outputs = bertjapanese(**inputs)
-
-Tips:
-
- This implementation is the same as BERT, except for tokenization method. Refer to the :doc:`documentation of BERT
-  <bert>` for more usage examples.
-
-BertJapaneseTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BertJapaneseTokenizer
-    :members: 
--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -38,22 +38,22 @@ Usage:

 .. code-block::

-    >>> # leverage checkpoints for Bert2Bert model...
-    >>> # use BERT's cls token as BOS token and sep token as EOS token
-    >>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
-    >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
-    >>> decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
-    >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+  # leverage checkpoints for Bert2Bert model...
+  # use BERT's cls token as BOS token and sep token as EOS token
+  encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
+  # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
+  decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
+  bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

-    >>> # create tokenizer...
-    >>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+  # create tokenizer...
+  tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

-    >>> input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
-    >>> labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+  input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
+  labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

-    >>> # train...
-    >>> loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
-    >>> loss.backward()
+  # train...
+  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
+  loss.backward()


 - Pretrained :class:`~transformers.EncoderDecoderModel` are also directly available in the model hub, e.g.,
@@ -61,15 +61,15 @@ Usage:

 .. code-block::

-    >>> # instantiate sentence fusion model
-    >>> sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
-    >>> tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
+  # instantiate sentence fusion model
+  sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
+  tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")

-    >>> input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+  input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids

-    >>> outputs = sentence_fuser.generate(input_ids)
+  outputs = sentence_fuser.generate(input_ids)

-    >>> print(tokenizer.decode(outputs[0]))
+  print(tokenizer.decode(outputs[0]))


 Tips:
--- a/docs/source/model_doc/bertweet.rst
+++ b/docs/source/model_doc/bertweet.rst
@@ -31,28 +31,28 @@ Example of use:

 .. code-block::

-    >>> import torch
-    >>> from transformers import AutoModel, AutoTokenizer 
+  import torch
+  from transformers import AutoModel, AutoTokenizer 

-    >>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+  bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

-    >>> # For transformers v4.x+: 
-    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+  # For transformers v4.x+: 
+  tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

-    >>> # For transformers v3.x: 
-    >>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+  # For transformers v3.x: 
+  # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

-    >>> # INPUT TWEET IS ALREADY NORMALIZED!
-    >>> line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+  # INPUT TWEET IS ALREADY NORMALIZED!
+  line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

-    >>> input_ids = torch.tensor([tokenizer.encode(line)])
+  input_ids = torch.tensor([tokenizer.encode(line)])

-    >>> with torch.no_grad():
-    ...     features = bertweet(input_ids)  # Models outputs are now tuples
+  with torch.no_grad():
+      features = bertweet(input_ids)  # Models outputs are now tuples

-    >>> # With TensorFlow 2.0+:
-    >>> # from transformers import TFAutoModel
-    >>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+  ## With TensorFlow 2.0+:
+  # from transformers import TFAutoModel
+  # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")


 The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
--- a/docs/source/model_doc/bigbird.rst
+++ b/docs/source/model_doc/bigbird.rst
@@ -1,130 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-BigBird
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The BigBird model was proposed in `Big Bird: Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by
-Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
-Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird is a sparse-attention
-based transformer which extends Transformer based models, such as BERT, to much longer sequences. In addition to sparse
-attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
-has been shown that applying sparse, global, and random attention approximates full attention, while being
-computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
-BigBird has shown improved performance on various long document NLP tasks, such as question answering and
-summarization, compared to BERT or RoBERTa.
-
-The abstract from the paper is the following:
-
-*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
-Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
-length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
-reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
-is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
-theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
-sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
-8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
-BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
-propose novel applications to genomics data.*
-
-Tips:
-
- For an in-depth explanation on how BigBird's attention works, see `this blog post
-  <https://huggingface.co/blog/big-bird>`__.
- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For sequence lengths < 1024, using
-  **original_full** is advised as there is no benefit in using **block_sparse** attention.
- The code currently uses window size of 3 blocks and 2 global blocks.
- Sequence length must be divisible by block size.
- Current implementation supports only **ITC**.
- Current implementation doesn't support **num_random_blocks = 0**
-
-The original code can be found `here <https://github.com/google-research/bigbird>`__.
-
-BigBirdConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdConfig
-    :members:
-
-
-BigBirdTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-BigBird specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
-    :members:
-
-
-BigBirdModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdModel
-    :members: forward
-
-
-BigBirdForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForPreTraining
-    :members: forward
-
-
-BigBirdForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForCausalLM
-    :members: forward
-
-
-BigBirdForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForMaskedLM
-    :members: forward
-
-
-BigBirdForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForSequenceClassification
-    :members: forward
-
-
-BigBirdForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForMultipleChoice
-    :members: forward
-
-
-BigBirdForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForTokenClassification
-    :members: forward
-
-
-BigBirdForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BigBirdForQuestionAnswering
-    :members: forward
--- a/docs/source/model_doc/convbert.rst
+++ b/docs/source/model_doc/convbert.rst
@@ -56,7 +56,8 @@ ConvBertTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.ConvBertTokenizerFast
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary


 ConvBertModel
--- a/docs/source/model_doc/cpm.rst
+++ b/docs/source/model_doc/cpm.rst
@@ -1,44 +0,0 @@
-..
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-CPM
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The CPM model was proposed in `CPM: A Large-scale Generative Chinese Pre-trained Language Model
-<https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin,
-Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen,
-Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-
-The abstract from the paper is the following:
-
-*Pre-trained Language Models (PLMs) have proven to be beneficial for various downstream NLP tasks. Recently, GPT-3,
-with 175 billion parameters and 570GB training data, drew a lot of attention due to the capacity of few-shot (even
-zero-shot) learning. However, applying GPT-3 to address Chinese NLP tasks is still challenging, as the training corpus
-of GPT-3 is primarily English, and the parameters are not publicly available. In this technical report, we release the
-Chinese Pre-trained Language Model (CPM) with generative pre-training on large-scale Chinese training data. To the best
-of our knowledge, CPM, with 2.6 billion parameters and 100GB Chinese training data, is the largest Chinese pre-trained
-language model, which could facilitate several downstream Chinese NLP tasks, such as conversation, essay generation,
-cloze test, and language understanding. Extensive experiments demonstrate that CPM achieves strong performance on many
-NLP tasks in the settings of few-shot (even zero-shot) learning.*
-
-The original implementation can be found here: https://github.com/TsinghuaAI/CPM-Generate
-
-Note: We only have a tokenizer here, since the model architecture is the same as GPT-2.
-
-CpmTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.CpmTokenizer
-    :members:
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -60,7 +60,7 @@ DebertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaModel
-    :members: forward
+    :members:


 DebertaPreTrainedModel
@@ -74,25 +74,25 @@ DebertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForMaskedLM
-    :members: forward
+    :members:


 DebertaForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForSequenceClassification
-    :members: forward
+    :members:


 DebertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForTokenClassification
-    :members: forward
+    :members:


 DebertaForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.DebertaForQuestionAnswering
-    :members: forward
+    :members:
--- a/docs/source/model_doc/deberta_v2.rst
+++ b/docs/source/model_doc/deberta_v2.rst
@@ -1,118 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-DeBERTa-v2
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
-<https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It is based on Google's
-BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
-
-It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
-RoBERTa.
-
-The abstract from the paper is the following:
-
-*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
-language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
-disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
-disentangled attention mechanism, where each word is represented using two vectors that encode its content and
-position, respectively, and the attention weights among words are computed using disentangled matrices on their
-contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
-predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
-the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
-(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
-pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
-
-
-The following information is visible directly on the `original implementation repository
-<https://github.com/microsoft/DeBERTa>`__. DeBERTa v2 is the second version of the DeBERTa model. It includes
-the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. You can
-find more details about this submission in the authors' `blog
-<https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/>`__.
-
-New in v2:
-
-- **Vocabulary** In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data.
-  Instead of a GPT2-based tokenizer, the tokenizer is now a
-  `sentencepiece-based <https://github.com/google/sentencepiece>`__ tokenizer.
-- **nGiE(nGram Induced Input Encoding)** The DeBERTa-v2 model uses an additional convolution layer aside with the first
-  transformer layer to better learn the local dependency of input tokens.
-- **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
-  experiments, this can save parameters without affecting the performance.
-- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
-  similar to T5.
-- **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
-  performance of downstream tasks.
-
-The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
-
-
-DebertaV2Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2Config
-    :members:
-
-
-DebertaV2Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2Tokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-DebertaV2Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2Model
-    :members: forward
-
-
-DebertaV2PreTrainedModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2PreTrainedModel
-    :members: forward
-
-
-DebertaV2ForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForMaskedLM
-    :members: forward
-
-
-DebertaV2ForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForSequenceClassification
-    :members: forward
-
-
-DebertaV2ForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForTokenClassification
-    :members: forward
-
-
-DebertaV2ForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DebertaV2ForQuestionAnswering
-    :members: forward
--- a/docs/source/model_doc/deit.rst
+++ b/docs/source/model_doc/deit.rst
@@ -1,109 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-DeiT
-----------------------------------------------------------------------------------------------------------------------
-
-.. note::
-
-    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
-    breaking changes to fix it in the future. If you see something strange, file a `Github Issue
-    <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The DeiT model was proposed in `Training data-efficient image transformers & distillation through attention
-<https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre
-Sablayrolles, Hervé Jégou. The `Vision Transformer (ViT) <https://huggingface.co/transformers/model_doc/vit.html>`__
-introduced in `Dosovitskiy et al., 2020 <https://arxiv.org/abs/2010.11929>`__ has shown that one can match or even
-outperform existing convolutional neural networks using a Transformer encoder (BERT-like). However, the ViT models
-introduced in that paper required training on expensive infrastructure for multiple weeks, using external data. DeiT
-(data-efficient image transformers) are more efficiently trained transformers for image classification, requiring far
-less data and far less computing resources compared to the original ViT models.
-
-The abstract from the paper is the following:
-
-*Recently, neural networks purely based on attention were shown to address image understanding tasks such as image
-classification. However, these visual transformers are pre-trained with hundreds of millions of images using an
-expensive infrastructure, thereby limiting their adoption. In this work, we produce a competitive convolution-free
-transformer by training on Imagenet only. We train them on a single computer in less than 3 days. Our reference vision
-transformer (86M parameters) achieves top-1 accuracy of 83.1% (single-crop evaluation) on ImageNet with no external
-data. More importantly, we introduce a teacher-student strategy specific to transformers. It relies on a distillation
-token ensuring that the student learns from the teacher through attention. We show the interest of this token-based
-distillation, especially when using a convnet as a teacher. This leads us to report results competitive with convnets
-for both Imagenet (where we obtain up to 85.2% accuracy) and when transferring to other tasks. We share our code and
-models.*
-
-Tips:
-
-- Compared to ViT, DeiT models use a so-called distillation token to effectively learn from a teacher (which, in the
-  DeiT paper, is a ResNet-like model). The distillation token is learned through backpropagation, by interacting with
-  the class ([CLS]) and patch tokens through the self-attention layers.
-- There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top
-  of the final hidden state of the class token and not using the distillation signal, or (2) by placing both a
-  prediction head on top of the class token and on top of the distillation token. In that case, the [CLS] prediction
-  head is trained using regular cross-entropy between the prediction of the head and the ground-truth label, while the
-  distillation prediction head is trained using hard distillation (cross-entropy between the prediction of the
-  distillation head and the label predicted by the teacher). At inference time, one takes the average prediction
-  between both heads as final prediction. (2) is also called "fine-tuning with distillation", because one relies on a
-  teacher that has already been fine-tuned on the downstream dataset. In terms of models, (1) corresponds to
-  :class:`~transformers.DeiTForImageClassification` and (2) corresponds to
-  :class:`~transformers.DeiTForImageClassificationWithTeacher`.
-- Note that the authors also did try soft distillation for (2) (in which case the distillation prediction head is
-  trained using KL divergence to match the softmax output of the teacher), but hard distillation gave the best results.
-- All released checkpoints were pre-trained and fine-tuned on ImageNet-1k only. No external data was used. This is in
-  contrast with the original ViT model, which used external data like the JFT-300M dataset/Imagenet-21k for
-  pre-training.
-- The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into
-  :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. Techniques like data
-  augmentation, optimization, and regularization were used in order to simulate training on a much larger dataset
-  (while only using ImageNet-1k for pre-training). There are 4 variants available (in 3 different sizes):
-  `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`, `facebook/deit-base-patch16-224` and
-  `facebook/deit-base-patch16-384`. Note that one should use :class:`~transformers.DeiTFeatureExtractor` in order to
-  prepare images for the model.
-
-
-DeiTConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DeiTConfig
-    :members:
-
-
-DeiTFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DeiTFeatureExtractor
-    :members: __call__
-
-
-DeiTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DeiTModel
-    :members: forward
-
-
-DeiTForImageClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DeiTForImageClassification
-    :members: forward
-
-
-DeiTForImageClassificationWithTeacher
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DeiTForImageClassificationWithTeacher
-    :members: forward
--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@@ -56,7 +56,7 @@ FSMTTokenizer

 .. autoclass:: transformers.FSMTTokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


 FSMTModel
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -50,7 +50,7 @@ The original code can be found `here <https://github.com/openai/finetune-transfo
 Note:

 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install ``ftfy``
-and ``SpaCy``:
+and ``SpaCy``::

 .. code-block:: bash

--- a/docs/source/model_doc/gpt_neo.rst
+++ b/docs/source/model_doc/gpt_neo.rst
@@ -1,65 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-GPT Neo
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The GPTNeo model was released in the `EleutherAI/gpt-neo <https://github.com/EleutherAI/gpt-neo>`__ repository by Sid
-Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. It is a GPT2 like causal language model trained on the
-`Pile <https://pile.eleuther.ai/>`__ dataset.
-
-The architecture is similar to GPT2 except that GPT Neo uses local attention in every other layer with a window size of
-256 tokens.
-
-Generation
-_______________________________________________________________________________________________________________________
-
-The :obj:`generate()` method can be used to generate text using GPT Neo model.
-
-.. code-block::
-
-    >>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
-    >>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
-    >>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
-
-    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-    ...          "researchers was the fact that the unicorns spoke perfect English."
-
-    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-
-    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
-    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
-
-
-GPTNeoConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPTNeoConfig
-    :members:
-
-
-GPTNeoModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPTNeoModel
-    :members: forward
-
-
-GPTNeoForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.GPTNeoForCausalLM
-    :members: forward
--- a/docs/source/model_doc/herbert.rst
+++ b/docs/source/model_doc/herbert.rst
@@ -40,20 +40,20 @@ Examples of use:

 .. code-block::

-    >>> from transformers import HerbertTokenizer, RobertaModel
+  from transformers import HerbertTokenizer, RobertaModel

-    >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
-    >>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
+  tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+  model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

-    >>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
-    >>> outputs = model(encoded_input)
+  encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+  outputs = model(encoded_input)

-    >>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
-    >>> import torch
-    >>> from transformers import AutoModel, AutoTokenizer
+  # HerBERT can also be loaded using AutoTokenizer and AutoModel:
+  import torch
+  from transformers import AutoModel, AutoTokenizer

-    >>> tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
-    >>> model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
+  tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+  model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")


 The original code can be found `here <https://github.com/allegro/HerBERT>`__.
--- a/docs/source/model_doc/ibert.rst
+++ b/docs/source/model_doc/ibert.rst
@@ -1,88 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-I-BERT
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The I-BERT model was proposed in `I-BERT: Integer-only BERT Quantization <https://arxiv.org/abs/2101.01321>`__ by
-Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney and Kurt Keutzer. It's a quantized version of RoBERTa running
-inference up to four times faster.
-
-The abstract from the paper is the following:
-
-*Transformer based models, like BERT and RoBERTa, have achieved state-of-the-art results in many Natural Language
-Processing tasks. However, their memory footprint, inference latency, and power consumption are prohibitive for
-efficient inference at the edge, and even at the data center. While quantization can be a viable solution for this,
-previous work on quantizing Transformer based models use floating-point arithmetic during inference, which cannot
-efficiently utilize integer-only logical units such as the recent Turing Tensor Cores, or traditional integer-only ARM
-processors. In this work, we propose I-BERT, a novel quantization scheme for Transformer based models that quantizes
-the entire inference with integer-only arithmetic. Based on lightweight integer-only approximation methods for
-nonlinear operations, e.g., GELU, Softmax, and Layer Normalization, I-BERT performs an end-to-end integer-only BERT
-inference without any floating point calculation. We evaluate our approach on GLUE downstream tasks using
-RoBERTa-Base/Large. We show that for both cases, I-BERT achieves similar (and slightly higher) accuracy as compared to
-the full-precision baseline. Furthermore, our preliminary implementation of I-BERT shows a speedup of 2.4 - 4.0x for
-INT8 inference on a T4 GPU system as compared to FP32 inference. The framework has been developed in PyTorch and has
-been open-sourced.*
-
-
-The original code can be found `here <https://github.com/kssteven418/I-BERT>`__.
-
-IBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertConfig
-    :members:
-
-
-IBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertModel
-    :members: forward
-
-
-IBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForMaskedLM
-    :members: forward
-
-
-IBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForSequenceClassification
-    :members: forward
-
-
-IBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForMultipleChoice
-    :members: forward
-
-
-IBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForTokenClassification
-    :members: forward
-
-
-IBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.IBertForQuestionAnswering
-    :members: forward
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -56,24 +56,24 @@ Tips:

 .. code-block::

-    def normalize_bbox(bbox, width, height):
-         return [
-             int(1000 * (bbox[0] / width)),
-             int(1000 * (bbox[1] / height)),
-             int(1000 * (bbox[2] / width)),
-             int(1000 * (bbox[3] / height)),
-         ]
+   def normalize_bbox(bbox, width, height):
+        return [
+            int(1000 * (bbox[0] / width)),
+            int(1000 * (bbox[1] / height)),
+            int(1000 * (bbox[2] / width)),
+            int(1000 * (bbox[3] / height)),
+        ]

 Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
 occurs. Those can be obtained using the Python Image Library (PIL) library for example, as follows:

 .. code-block::

-    from PIL import Image
+   from PIL import Image

-    image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+   image = Image.open("name_of_your_document - can be a png file, pdf, etc.")

-    width, height = image.size
+   width, height = image.size

 - For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset
  <https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
@@ -130,31 +130,3 @@ LayoutLMForTokenClassification

 .. autoclass:: transformers.LayoutLMForTokenClassification
    :members:
-
-
-TFLayoutLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFLayoutLMModel
-    :members:
-
-
-TFLayoutLMForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFLayoutLMForMaskedLM
-    :members:
-
-
-TFLayoutLMForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFLayoutLMForSequenceClassification
-    :members:
-
-
-TFLayoutLMForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFLayoutLMForTokenClassification
-    :members:
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -73,7 +73,8 @@ LEDTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LEDTokenizerFast
-    :members:
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary


 LED specific outputs
--- a/docs/source/model_doc/m2m_100.rst
+++ b/docs/source/model_doc/m2m_100.rst
@@ -1,128 +0,0 @@
-.. 
-    Copyright 2020 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-M2M100
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The M2M100 model was proposed in `Beyond English-Centric Multilingual Machine Translation
-<https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky,
-Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy
-Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-
-The abstract from the paper is the following:
-
-*Existing work in translation demonstrated the potential of massively multilingual machine translation by training a
-single model able to translate between any pair of languages. However, much of this work is English-Centric by training
-only on data which was translated from or to English. While this is supported by large sources of training data, it
-does not reflect translation needs worldwide. In this work, we create a true Many-to-Many multilingual translation
-model that can translate directly between any pair of 100 languages. We build and open source a training dataset that
-covers thousands of language directions with supervised data, created through large-scale mining. Then, we explore how
-to effectively increase model capacity through a combination of dense scaling and language-specific sparse parameters
-to create high quality models. Our focus on non-English-Centric models brings gains of more than 10 BLEU when directly
-translating between non-English directions while performing competitively to the best single systems of WMT. We
-open-source our scripts so that others may reproduce the data, evaluation, and final M2M-100 model.*
-
-
-Training and Generation
-_______________________________________________________________________________________________________________________
-
-M2M100 is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is
-multilingual it expects the sequences in a certain format: A special language id token is used as prefix in both the
-source and target text. The source text format is :obj:`[lang_code] X [eos]`, where :obj:`lang_code` is source language
-id for source text and target language id for target text, with :obj:`X` being the source or target text.
-
-The :class:`~transformers.M2M100Tokenizer` depends on :obj:`sentencepiece` so be sure to install it before running the
-examples. To install :obj:`sentencepiece` run ``pip install sentencepiece``.
-
- Supervised Training
-
-.. code-block::
-
-    from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
-
-    model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-    tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
-
-    src_text = "Life is like a box of chocolates."
-    tgt_lang = "La vie est comme une boîte de chocolat."
-
-    model_inputs = tokenizer(src_text, return_tensors="pt")
-    with tokenizer.as_target_tokenizer():
-        labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-
-    loss = model(**model_inputs, labels=labels) # forward pass
-
-
- Generation
-
-    M2M100 uses the :obj:`eos_token_id` as the :obj:`decoder_start_token_id` for generation with the target language id
-    being forced as the first generated token. To force the target language id as the first generated token, pass the
-    `forced_bos_token_id` parameter to the `generate` method. The following example shows how to translate between
-    Hindi to French and Chinese to English using the `facebook/m2m100_418M` checkpoint.
-
-.. code-block::
-
-    >>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
-
-    >>> hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
-    >>> chinese_text = "生活就像一盒巧克力。"
-
-    >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
-    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
-
-    >>> # translate Hindi to French
-    >>> tokenizer.src_lang = "hi"
-    >>> encoded_hi = tokenizer(hi_text, return_tensors="pt")
-    >>> generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
-    >>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    "La vie est comme une boîte de chocolat."
-
-    >>> # translate Chinese to English
-    >>> tokenizer.src_lang = "zh"
-    >>> encoded_zh = tokenizer(chinese_text, return_tensors="pt")
-    >>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
-    >>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    "Life is like a box of chocolate."
-
-
-M2M100Config
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100Config
-    :members:
-
-
-M2M100Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100Tokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-M2M100Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100Model
-    :members: forward
-
-
-M2M100ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.M2M100ForConditionalGeneration
-    :members: forward
-
-
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -76,29 +76,27 @@ require 3 character language codes:

 .. code-block:: python

-    >>> from transformers import MarianMTModel, MarianTokenizer
-    >>> src_text = [
-    ...     '>>fra<< this is a sentence in english that we want to translate to french',
-    ...     '>>por<< This should go to portuguese',
-    ...     '>>esp<< And this to Spanish'
-    >>> ]
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fra<< this is a sentence in english that we want to translate to french',
+        '>>por<< This should go to portuguese',
+        '>>esp<< And this to Spanish'
+    ]

-    >>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
-    >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
-    >>> print(tokenizer.supported_language_codes)
-    ['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
-
-    >>> model = MarianMTModel.from_pretrained(model_name)
-    >>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
-    >>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    ["c'est une phrase en anglais que nous voulons traduire en français",
-     'Isto deve ir para o português.',
-     'Y esto al español']
+    model_name = 'Helsinki-NLP/opus-mt-en-roa'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']




-Here is the code to see all available pretrained models on the hub:
+Code to see available pretrained models:

 .. code-block:: python

@@ -149,22 +147,21 @@ Example of translating english to many romance languages, using old-style 2 char

 .. code-block:: python

-    >>> from transformers import MarianMTModel, MarianTokenizer
-    >>> src_text = [
-    ...     '>>fr<< this is a sentence in english that we want to translate to french',
-    ...     '>>pt<< This should go to portuguese',
-    ...     '>>es<< And this to Spanish'
-    >>> ]
+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fr<< this is a sentence in english that we want to translate to french',
+        '>>pt<< This should go to portuguese',
+        '>>es<< And this to Spanish'
+    ]

-    >>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
-    >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)

-    >>> model = MarianMTModel.from_pretrained(model_name)
-    >>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
-    >>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    ["c'est une phrase en anglais que nous voulons traduire en français", 
-     'Isto deve ir para o português.',
-     'Y esto al español']
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.',  'Y esto al español']



@@ -179,7 +176,7 @@ MarianTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MarianTokenizer
-    :members: as_target_tokenizer
+    :members: prepare_seq2seq_batch


 MarianModel
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -10,14 +10,14 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-MBart and MBart-50
+MBart
 -----------------------------------------------------------------------------------------------------------------------

 **DISCLAIMER:** If you see something strange, file a `Github Issue
 <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@patrickvonplaten

-Overview of MBart
+Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 The MBart model was presented in `Multilingual Denoising Pre-training for Neural Machine Translation
@@ -31,34 +31,33 @@ on the encoder, decoder, or reconstructing parts of the text.

 The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__

-Training of MBart
+Examples
 _______________________________________________________________________________________________________________________

-MBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for translation task. As the
-model is multilingual it expects the sequences in a different format. A special language id token is added in both the
-source and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The
-target text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
+- Examples and scripts for fine-tuning mBART and other models for sequence to sequence tasks can be found in
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
+  :class:`MarianMTModel` is usually a better choice for bilingual machine translation.

-The regular :meth:`~transformers.MBartTokenizer.__call__` will encode source text format, and it should be wrapped
-inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokenizer` to encode target text format.
+Training
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+MBart is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is
+multilingual it expects the sequences in a different format. A special language id token is added in both the source
+and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The target
+text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
+
+The :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch` handles this automatically and should be used to encode
+the sequences for sequence-to-sequence fine-tuning.

 - Supervised training

 .. code-block::

-    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer
-
-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
-    >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
-    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
-
-    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
-    >>> # forward pass
-    >>> model(**inputs, labels=batch['labels'])
+    example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
+    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
+    model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

 - Generation

@@ -67,95 +66,14 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz

 .. code-block::

-    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer
-
-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
-    >>> article = "UN Chief Says There Is No Military Solution in Syria"
-    >>> inputs = tokenizer(article, return_tensors="pt")
-    >>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
-    >>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
-    "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-
-Overview of MBart-50
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-MBart-50 was introduced in the `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning
-<https://arxiv.org/abs/2008.00401>` paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
-Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original `mbart-large-cc25` checkpoint by extendeding
-its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
-languages.
-
-According to the abstract
-
-*Multilingual translation models can be created through multilingual finetuning. Instead of finetuning on one
-direction, a pretrained model is finetuned on many directions at the same time. It demonstrates that pretrained models
-can be extended to incorporate additional languages without loss of performance. Multilingual finetuning improves on
-average 1 BLEU over the strongest baselines (being either multilingual from scratch or bilingual finetuning) while
-improving 9.3 BLEU on average over bilingual baselines from scratch.*
-
-
-Training of MBart-50
-_______________________________________________________________________________________________________________________
-
-The text format for MBart-50 is slightly different from mBART. For MBart-50 the language id token is used as a prefix
-for both source and target text i.e the text format is :obj:`[lang_code] X [eos]`, where :obj:`lang_code` is source
-language id for source text and target language id for target text, with :obj:`X` being the source or target text
-respectively.
-
-
-MBart-50 has its own tokenizer :class:`~transformers.MBart50Tokenizer`.
-
-  Supervised training
-
-.. code-block::
-
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-
-    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
-    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
-
-    src_text = " UN Chief Says There Is No Military Solution in Syria"
-    tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
-
-    model_inputs = tokenizer(src_text, return_tensors="pt")
-    with tokenizer.as_target_tokenizer():
-        labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-
-    model(**model_inputs, labels=labels) # forward pass
-
-
- Generation
-
-    To generate using the mBART-50 multilingual translation models, :obj:`eos_token_id` is used as the
-    :obj:`decoder_start_token_id` and the target language id is forced as the first generated token. To force the
-    target language id as the first generated token, pass the `forced_bos_token_id` parameter to the `generate` method.
-    The following example shows how to translate between Hindi to French and Arabic to English using the
-    `facebook/mbart-50-large-many-to-many` checkpoint.
-
-.. code-block::
-
-    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
-
-    article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
-    article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
-
-    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-
-    # translate Hindi to French
-    tokenizer.src_lang = "hi_IN"
-    encoded_hi = tokenizer(article_hi, return_tensors="pt")
-    generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
-    tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    # => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."
-
-    # translate Arabic to English
-    tokenizer.src_lang = "ar_AR"
-    encoded_ar = tokenizer(article_ar, return_tensors="pt")
-    generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
-    tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
-    # => "The Secretary-General of the United Nations says there is no military solution in Syria."
+    from transformers import MBartForConditionalGeneration, MBartTokenizer
+    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
+    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
+    article = "UN Chief Says There Is No Military Solution in Syria"
+    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
+    translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
+    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"


 MBartConfig
@@ -169,7 +87,7 @@ MBartTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MBartTokenizer
-    :members: as_target_tokenizer, build_inputs_with_special_tokens
+    :members: build_inputs_with_special_tokens, prepare_seq2seq_batch


 MBartTokenizerFast
@@ -179,20 +97,6 @@ MBartTokenizerFast
    :members:


-MBart50Tokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MBart50Tokenizer
-    :members:
-
-
-MBart50TokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MBart50TokenizerFast
-    :members:
-
-
 MBartModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/megatron_bert.rst
+++ b/docs/source/model_doc/megatron_bert.rst
@@ -1,153 +0,0 @@
-.. 
-    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-MegatronBERT
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The MegatronBERT model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
-Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
-Jared Casper and Bryan Catanzaro.
-
-The abstract from the paper is the following:
-
-*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
-Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
-constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
-efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
-approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
-parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
-illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
-15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
-that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
-the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
-billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
-BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
-achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
-accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
-of 89.4%).*
-
-Tips:
-
-We have provided pretrained `BERT-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_bert_345m>`__ checkpoints
-for use to evaluate or finetuning downstream tasks.
-
-To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
-Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
-<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
-
-Alternatively, you can directly download the checkpoints using:
-
-BERT-345M-uncased::
-
-.. code-block:: bash
-
-    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip
-    -O megatron_bert_345m_v0_1_uncased.zip
-
-BERT-345M-cased::
-
-.. code-block:: bash
-
-    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O
-    megatron_bert_345m_v0_1_cased.zip
-
-Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will
-easily be loaded by Hugging Face Transformers and our port of the BERT code.
-
-The following commands allow you to do the conversion. We assume that the folder ``models/megatron_bert`` contains
-``megatron_bert_345m_v0_1_{cased, uncased}.zip`` and that the commands are run from inside that folder::
-
-.. code-block:: bash
-
-    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip 
-
-.. code-block:: bash
-
-    python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip
-
-The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
-and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
-approach using "tensor parallel" and "pipeline parallel" techniques.
-
-MegatronBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertConfig
-    :members:
-
-
-MegatronBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertModel
-    :members: forward
-
-
-MegatronBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForMaskedLM
-    :members: forward
-
-
-MegatronBertForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForCausalLM
-    :members: forward
-
-
-MegatronBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForNextSentencePrediction
-    :members: forward
-
-
-MegatronBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForPreTraining
-    :members: forward
-
-
-MegatronBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForSequenceClassification
-    :members: forward
-
-
-MegatronBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForMultipleChoice
-    :members: forward
-
-
-MegatronBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForTokenClassification
-    :members: forward
-
-
-MegatronBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.MegatronBertForQuestionAnswering
-    :members: forward
-
-
--- a/docs/source/model_doc/megatron_gpt2.rst
+++ b/docs/source/model_doc/megatron_gpt2.rst
@@ -1,70 +0,0 @@
-.. 
-    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-MegatronGPT2
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The MegatronGPT2 model was proposed in `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model
-Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley,
-Jared Casper and Bryan Catanzaro.
-
-The abstract from the paper is the following:
-
-*Recent work in language modeling demonstrates that training large transformer models advances the state of the art in
-Natural Language Processing applications. However, very large models can be quite difficult to train due to memory
-constraints. In this work, we present our techniques for training very large transformer models and implement a simple,
-efficient intra-layer model parallel approach that enables training transformer models with billions of parameters. Our
-approach does not require a new compiler or library changes, is orthogonal and complimentary to pipeline model
-parallelism, and can be fully implemented with the insertion of a few communication operations in native PyTorch. We
-illustrate this approach by converging transformer based models up to 8.3 billion parameters using 512 GPUs. We sustain
-15.1 PetaFLOPs across the entire application with 76% scaling efficiency when compared to a strong single GPU baseline
-that sustains 39 TeraFLOPs, which is 30% of peak FLOPs. To demonstrate that large language models can further advance
-the state of the art (SOTA), we train an 8.3 billion parameter transformer language model similar to GPT-2 and a 3.9
-billion parameter model similar to BERT. We show that careful attention to the placement of layer normalization in
-BERT-like models is critical to achieving increased performance as the model size grows. Using the GPT-2 model we
-achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA
-accuracy of 63.2%) datasets. Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy
-of 89.4%).*
-
-Tips:
-
-We have provided pretrained `GPT2-345M <https://ngc.nvidia.com/catalog/models/nvidia:megatron_lm_345m>`__ checkpoints
-for use to evaluate or finetuning downstream tasks.
-
-To access these checkpoints, first `sign up <https://ngc.nvidia.com/signup>`__ for and setup the NVIDIA GPU Cloud (NGC)
-Registry CLI. Further documentation for downloading models can be found in the `NGC documentation
-<https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html#topic_6_4_1>`__.
-
-Alternatively, you can directly download the checkpoints using::
-
-.. code-block:: bash
-
-    wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O
-    megatron_gpt2_345m_v0_0.zip
-
-Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily
-be loaded by Hugging Face Transformers GPT2 implementation.
-
-The following command allows you to do the conversion. We assume that the folder ``models/megatron_gpt2`` contains
-``megatron_gpt2_345m_v0_0.zip`` and that the command is run from that folder::
-
-.. code-block:: bash
-
-    python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip
-
-The original code can be found `here <https://github.com/NVIDIA/Megatron-LM>`__. That repository contains a multi-GPU
-and multi-node implementation of the Megatron Language models. In particular, it contains a hybrid model parallel
-approach using "tensor parallel" and "pipeline parallel" techniques.
-
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -51,8 +51,8 @@ All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-
 Examples
 _______________________________________________________________________________________________________________________

- :prefix_link:`Script <examples/research_projects/seq2seq-distillation/finetune_pegasus_xsum.sh>` to fine-tune pegasus
-  on the XSUM dataset. Data download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- :prefix_link:`Script <examples/seq2seq/finetune_pegasus_xsum.sh>` to fine-tune pegasus on the XSUM dataset. Data
+  download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
 - FP16 is not supported (help/ideas on this appreciated!).
 - The adafactor optimizer is recommended for pegasus fine-tuning.

@@ -78,20 +78,20 @@ Usage Example

 .. code-block:: python

-    >>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
-    >>> import torch
-    >>> src_text = [
-    ...     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
-    >>> ]
+    from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+    import torch
+    src_text = [
+        """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
+    ]

-    >>> model_name = 'google/pegasus-xsum'
-    >>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    >>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
-    >>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
-    >>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
-    >>> translated = model.generate(**batch)
-    >>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
-    >>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+    model_name = 'google/pegasus-xsum'
+    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tokenizer = PegasusTokenizer.from_pretrained(model_name)
+    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
+    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
+    translated = model.generate(**batch)
+    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+    assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."



@@ -107,7 +107,7 @@ PegasusTokenizer
 warning: ``add_tokens`` does not work at the moment.

 .. autoclass:: transformers.PegasusTokenizer
-    :members:
+    :members: __call__, prepare_seq2seq_batch


 PegasusTokenizerFast
--- a/docs/source/model_doc/phobert.rst
+++ b/docs/source/model_doc/phobert.rst
@@ -31,23 +31,23 @@ Example of use:

 .. code-block::

-    >>> import torch
-    >>> from transformers import AutoModel, AutoTokenizer
+  import torch
+  from transformers import AutoModel, AutoTokenizer

-    >>> phobert = AutoModel.from_pretrained("vinai/phobert-base")
-    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+  phobert = AutoModel.from_pretrained("vinai/phobert-base")
+  tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

-    >>> # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
-    >>> line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+  # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+  line = "Tôi là sinh_viên trường đại_học Công_nghệ ."

-    >>> input_ids = torch.tensor([tokenizer.encode(line)])
+  input_ids = torch.tensor([tokenizer.encode(line)])

-    >>> with torch.no_grad():
-    ...     features = phobert(input_ids)  # Models outputs are now tuples
+  with torch.no_grad():
+      features = phobert(input_ids)  # Models outputs are now tuples

-    >>> # With TensorFlow 2.0+:
-    >>> # from transformers import TFAutoModel
-    >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
+  ## With TensorFlow 2.0+:
+  # from transformers import TFAutoModel
+  # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")


 The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
--- a/docs/source/model_doc/rag.rst
+++ b/docs/source/model_doc/rag.rst
@@ -56,7 +56,7 @@ RagTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagTokenizer
-    :members:
+    :members: prepare_seq2seq_batch


 Rag specific outputs
@@ -94,24 +94,3 @@ RagTokenForGeneration

 .. autoclass:: transformers.RagTokenForGeneration
    :members: forward, generate
-
-
-TFRagModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagModel
-    :members: call
-
-
-TFRagSequenceForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagSequenceForGeneration
-    :members: call, generate
-
-
-TFRagTokenForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFRagTokenForGeneration
-    :members: call, generate
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -145,8 +145,8 @@ For training, the :class:`~transformers.ReformerModelWithLMHead` should be used

 .. code-block::

-    input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
-    loss = model(input_ids, labels=input_ids)[0]
+  input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+  loss = model(input_ids, labels=input_ids)[0]


 ReformerConfig
--- a/docs/source/model_doc/speech_to_text.rst
+++ b/docs/source/model_doc/speech_to_text.rst
@@ -1,152 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Speech2Text
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Speech2Text model was proposed in `fairseq S2T: Fast Speech-to-Text Modeling with fairseq
-<https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. It's a
-transformer-based seq2seq (encoder-decoder) model designed for end-to-end Automatic Speech Recognition (ASR) and Speech
-Translation (ST). It uses a convolutional downsampler to reduce the length of speech inputs by 3/4th before they are
-fed into the encoder. The model is trained with standard autoregressive cross-entropy loss and generates the
-transcripts/translations autoregressively. Speech2Text has been fine-tuned on several datasets for ASR and ST:
-`LibriSpeech <http://www.openslr.org/12>`__, `CoVoST 2 <https://github.com/facebookresearch/covost>`__, `MuST-C
-<https://ict.fbk.eu/must-c/>`__.
-
-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/speech_to_text>`__.
-
-
-Inference
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Speech2Text is a speech model that accepts a float tensor of log-mel filter-bank features extracted from the speech
-signal. It's a transformer-based seq2seq model, so the transcripts/translations are generated autoregressively. The
-:obj:`generate()` method can be used for inference.
-
-The :class:`~transformers.Speech2TextFeatureExtractor` class is responsible for extracting the log-mel filter-bank
-features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transformers.Speech2TextFeatureExtractor` and
-:class:`~transformers.Speech2TextTokenizer` into a single instance to both extract the input features and decode the
-predicted token ids.
-
-The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
-install those packages before running the examples. You could either install those as extra speech dependencies with
-``pip install transformers"[speech, sentencepiece]"`` or install the packages separately with ``pip install torchaudio
-sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
-<http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
-be installed as follows: ``apt install libsndfile1-dev``
-
-
- ASR and Speech Translation
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
-        >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"])
-
-        >>> transcription = processor.batch_decode(generated_ids)
-
-
- Multilingual speech translation
-
-    For multilingual speech translation models, :obj:`eos_token_id` is used as the :obj:`decoder_start_token_id` and
-    the target language id is forced as the first generated token. To force the target language id as the first
-    generated token, pass the :obj:`forced_bos_token_id` parameter to the :obj:`generate()` method. The following
-    example shows how to translate English speech to French text using the `facebook/s2t-medium-mustc-multilingual-st`
-    checkpoint.
-
-.. code-block::
-
-        >>> import torch
-        >>> from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
-
-        >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-        >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
-
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
-
-        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
-
-        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
-        >>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask"], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
-
-        >>> translation = processor.batch_decode(generated_ids)
-
-
-See the `model hub <https://huggingface.co/models?filter=speech_to_text>`__ to look for Speech2Text checkpoints.
-
-
-Speech2TextConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextConfig
-    :members:
-
-
-Speech2TextTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextTokenizer
-    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
-
-
-Speech2TextFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextFeatureExtractor
-    :members: __call__
-
-
-Speech2TextProcessor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextProcessor
-    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
-Speech2TextModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextModel
-    :members: forward
-
-
-Speech2TextForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Speech2TextForConditionalGeneration
-    :members: forward
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -73,10 +73,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash

 .. code-block::

-    input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-    labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+  # the forward function automatically creates the correct decoder_input_ids
+  loss = model(input_ids=input_ids, labels=labels).loss

 - Supervised training

@@ -86,10 +86,10 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash

 .. code-block::

-    input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-    labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+  # the forward function automatically creates the correct decoder_input_ids
+  loss = model(input_ids=input_ids, labels=labels).loss


 T5Config
@@ -104,7 +104,7 @@ T5Tokenizer

 .. autoclass:: transformers.T5Tokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
-        create_token_type_ids_from_sequences, save_vocabulary
+        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


 T5TokenizerFast
--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -1,102 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-Vision Transformer (ViT)
-----------------------------------------------------------------------------------------------------------------------
-
-.. note::
-
-    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
-    breaking changes to fix it in the future. If you see something strange, file a `Github Issue
-    <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition
-at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
-Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
-Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
-very good results compared to familiar convolutional architectures.
-
-
-The abstract from the paper is the following:
-
-*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
-applications to computer vision remain limited. In vision, attention is either applied in conjunction with
-convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
-structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
-sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
-data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
-Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
-substantially fewer computational resources to train.*
-
-Tips:
-
- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
-  which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
-  used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
-  vectors to a standard Transformer encoder.
- As the Vision Transformer expects each image to be of the same size (resolution), one can use
-  :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model.
- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
-  each checkpoint. For example, :obj:`google/vit-base-patch16-224` refers to a base-sized architecture with patch
-  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
-  <https://huggingface.co/models?search=vit>`__.
- The available checkpoints are either (1) pre-trained on `ImageNet-21k <http://www.image-net.org/>`__ (a collection of
-  14 million images and 21k classes) only, or (2) also fine-tuned on `ImageNet
-  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
-  images and 1,000 classes).
- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
-  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
-  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. In order to fine-tune at higher resolution, the authors perform
-  2D interpolation of the pre-trained position embeddings, according to their location in the original image.
- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
-  an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked
-  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
-  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.
-
-
-The original code (written in JAX) can be found `here <https://github.com/google-research/vision_transformer>`__.
-
-Note that we converted the weights from Ross Wightman's `timm library
-<https://github.com/rwightman/pytorch-image-models>`__, who already converted the weights from JAX to PyTorch. Credits
-go to him!
-
-
-ViTConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTConfig
-    :members:
-
-
-ViTFeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTFeatureExtractor
-    :members: __call__
-
-
-ViTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTModel
-    :members: forward
-
-
-ViTForImageClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.ViTForImageClassification
-    :members: forward
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -34,7 +34,7 @@ Tips:

 - Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
 - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
-  using :class:`~transformers.Wav2Vec2CTCTokenizer`.
+  using :class:`~transformers.Wav2Vec2Tokenizer`.


 Wav2Vec2Config
@@ -44,27 +44,13 @@ Wav2Vec2Config
    :members:


-Wav2Vec2CTCTokenizer
+Wav2Vec2Tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.Wav2Vec2CTCTokenizer
+.. autoclass:: transformers.Wav2Vec2Tokenizer
    :members: __call__, save_vocabulary


-Wav2Vec2FeatureExtractor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2FeatureExtractor
-    :members: __call__
-
-
-Wav2Vec2Processor
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.Wav2Vec2Processor
-    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
-
-
 Wav2Vec2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlsr_wav2vec2.rst
+++ b/docs/source/model_doc/xlsr_wav2vec2.rst
@@ -1,45 +0,0 @@
-.. 
-    Copyright 2021 The HuggingFace Team. All rights reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-    the License. You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-    specific language governing permissions and limitations under the License.
-
-XLSR-Wav2Vec2
-----------------------------------------------------------------------------------------------------------------------
-
-Overview
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The XLSR-Wav2Vec2 model was proposed in `Unsupervised Cross-Lingual Representation Learning For Speech Recognition
-<https://arxiv.org/abs/2006.13979>`__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael
-Auli.
-
-The abstract from the paper is the following:
-
-*This paper presents XLSR which learns cross-lingual speech representations by pretraining a single model from the raw
-waveform of speech in multiple languages. We build on wav2vec 2.0 which is trained by solving a contrastive task over
-masked latent speech representations and jointly learns a quantization of the latents shared across languages. The
-resulting model is fine-tuned on labeled data and experiments show that cross-lingual pretraining significantly
-outperforms monolingual pretraining. On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction
-of 72% compared to the best known results. On BABEL, our approach improves word error rate by 16% relative compared to
-a comparable system. Our approach enables a single multilingual speech recognition model which is competitive to strong
-individual models. Analysis shows that the latent discrete speech representations are shared across languages with
-increased sharing for related languages. We hope to catalyze research in low-resource speech understanding by releasing
-XLSR-53, a large model pretrained in 53 languages.*
-
-Tips:
-
- XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
- XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be
-  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
-
-XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to :doc:`Wav2Vec2's documentation page
-<wav2vec2>`.
-
-The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/fairseq/models/wav2vec>`__.
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -18,8 +18,8 @@ that the metric applies specifically to classical language models (sometimes cal
 models) and is not well defined for masked language models like BERT (see :doc:`summary of the models
 <model_summary>`).

-Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized
-sequence :math:`X = (x_0, x_1, \dots, x_t)`, then the perplexity of :math:`X` is,
+Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a
+tokenized sequence :math:`X = (x_0, x_1, \dots, x_t)`, then the perplexity of :math:`X` is,

 .. math::

--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -139,12 +139,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
 |                    |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| GPTNeo             | ``EleutherAI/gpt-neo-1.3B``                                | | 24-layer, 2048-hidden, 16-heads, 1.3B parameters.                                                                                   |
-|                    |                                                            | | EleutherAI's GPT-3 like language model.                                                                                             |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``EleutherAI/gpt-neo-2.7B``                                | | 32-layer, 2560-hidden, 20-heads, 2.7B parameters.                                                                                   |
-|                    |                                                            | | EleutherAI's GPT-3 like language model.                                                                                             |
-+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Transformer-XL     | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
 |                    |                                                            | | English model trained on wikitext-103                                                                                               |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -371,12 +365,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
 |                    |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| M2M100             | ``facebook/m2m100_418M``                                   | | 24-layer, 1024-hidden, 16-heads, 418M parameters                                                                                    |
-|                    |                                                            | | multilingual machine translation model for 100 languages                                                                            |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/m2m100_1.2B``                                   | | 48-layer, 1024-hidden, 16-heads, 1.2B parameters                                                                                    |
-|                    |                                                            | | multilingual machine translation model for 100 languages                                                                            |
-+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | MarianMT           | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
 |                    |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
@@ -393,15 +381,6 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``facebook/mbart-large-en-ro``                             | | 24-layer, 1024-hidden, 16-heads, 610M parameters                                                                                    |
 |                    |                                                            | | mbart-large-cc25 model finetuned on WMT english romanian translation.                                                               |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/mbart-large-50``                                | | 24-layer, 1024-hidden, 16-heads,                                                                                                    |
-|                    |                                                            | | mBART model trained on 50 languages' monolingual corpus.                                                                            |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/mbart-large-50-one-to-many-mmt``                | | 24-layer, 1024-hidden, 16-heads,                                                                                                    |
-|                    |                                                            | | mbart-50-large model finetuned for one (English) to many multilingual machine translation covering 50 languages.                    |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``facebook/mbart-large-50-many-to-many-mmt``               | | 24-layer, 1024-hidden, 16-heads,                                                                                                    |
-|                    |                                                            | | mbart-50-large model finetuned for many to many multilingual machine translation covering 50 languages.                             |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Lxmert             | ``lxmert-base-uncased``                                    | | 9-language layers, 9-relationship layers, and 12-cross-modality layers                                                              |
 |                    |                                                            | | 768-hidden, 12-heads (for each layer) ~ 228M parameters                                                                             |
@@ -455,30 +434,15 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    |                                                            |                                                                                                                                       |
 |                    |                                                            | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__)                                                           |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| DeBERTa            | ``microsoft/deberta-base``                                 | | 12-layer, 768-hidden, 12-heads, ~140M parameters                                                                                    |
+| DeBERTa            | ``microsoft/deberta-base``                                 | | 12-layer, 768-hidden, 12-heads, ~125M parameters                                                                                    |
 |                    |                                                            | | DeBERTa using the BERT-base architecture                                                                                            |
 |                    |                                                            |                                                                                                                                       |
 |                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-large``                                | | 24-layer, 1024-hidden, 16-heads, ~400M parameters                                                                                   |
+|                    | ``microsoft/deberta-large``                                | | 24-layer, 1024-hidden, 16-heads, ~390M parameters                                                                                   |
 |                    |                                                            | | DeBERTa using the BERT-large architecture                                                                                           |
 |                    |                                                            |                                                                                                                                       |
 |                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-xlarge``                               | | 48-layer, 1024-hidden, 16-heads, ~750M parameters                                                                                   |
-|                    |                                                            | | DeBERTa XLarge with similar BERT architecture                                                                                       |
-|                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-xlarge-v2``                            | | 24-layer, 1536-hidden, 24-heads, ~900M parameters                                                                                   |
-|                    |                                                            | | DeBERTa XLarge V2 with similar BERT architecture                                                                                    |
-|                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
-|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                    | ``microsoft/deberta-xxlarge-v2``                           | | 48-layer, 1536-hidden, 24-heads, ~1.5B parameters                                                                                   |
-|                    |                                                            | | DeBERTa XXLarge V2 with similar BERT architecture                                                                                   |
-|                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/microsoft/DeBERTa>`__)                                                                              |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | SqueezeBERT        | ``squeezebert/squeezebert-uncased``                        | | 12-layer, 768-hidden, 12-heads, 51M parameters, 4.3x faster than bert-base-uncased on a smartphone.                                 |
 |                    |                                                            | | SqueezeBERT architecture pretrained from scratch on masked language model (MLM) and sentence order prediction (SOP) tasks.          |
--- a/docs/source/sagemaker.md
+++ b/docs/source/sagemaker.md
@@ -1,393 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# Run training on Amazon SageMaker
-
-Hugging Face and Amazon are introducing new [Hugging Face Deep Learning Containers (DLCs)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) to make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
-
-To learn how to access and use the new Hugging Face DLCs with the Amazon SageMaker Python SDK, check out the guides and resources below.
-
---
-
-## Deep Learning Container (DLC) overview
-
-The Deep Learning Containers are available in every region where Amazon SageMaker is available. You can see the [AWS region table](https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/) for all AWS global infrastructure. To get a detailed overview of all included packages, look [here in the release notes](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html).
-
-| 🤗 Transformers version | 🤗 Datasets version | PyTorch/TensorFlow version | type     | device | Python Version | Example `image_uri`                                                                                                               |
-| ----------------------- | ------------------- | -------------------------- | -------- | ------ | -------------- | --------------------------------------------------------------------------------------------------------------------------------- |
-| 4.4.2                   | 1.5.0               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04`    |
-| 4.4.2                   | 1.5.0               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04` |
-
---
-
-## Getting Started: Train a 🤗 Transformers Model
-
-To train a 🤗 Transformers model by using the `HuggingFace` SageMaker Python SDK you need to:
-
- [Prepare a training script](#prepare-a-transformers-fine-tuning-script)
- [Create a `HuggingFace` Estimator](#create-an-huggingface-estimator)
- [Run training by calling the `fit` method](#execute-training)
- [Access your model](#access-trained-model)
-
-### Setup & Installation
-
-Before you can train a 🤗 Transformers model with Amazon SageMaker you need to sign up for an AWS account. If you do not have an AWS account yet, learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).
-
-After you complete these tasks you can get started using either [SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio-onboard.html), [SageMaker Notebook Instances](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html), or a local environment. To start training locally you need to configure the right [IAM permission](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).
-
-Upgrade to the latest `sagemaker` version.
-
-```bash
-pip install sagemaker --upgrade
-```
-
-**SageMaker environment**
-
-_Note: The execution role is intended to be available only when running a notebook within SageMaker. If you run `get_execution_role` in a notebook not on SageMaker, expect a "region" error._
-
-```python
-import sagemaker
-sess = sagemaker.Session()
-role = sagemaker.get_execution_role()
-```
-
-**Local environment**
-
-```python
-import sagemaker
-import boto3
-
-iam_client = boto3.client('iam')
-role = iam_client.get_role(RoleName='role-name-of-your-iam-role-with-right-permissions')['Role']['Arn']
-sess = sagemaker.Session()
-```
-
-### Prepare a 🤗 Transformers fine-tuning script.
-
-The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following:
-
- `SM_MODEL_DIR`: A string that represents the path where the training job writes the model artifacts to. After training, artifacts in this directory are uploaded to S3 for model hosting. `SM_MODEL_DIR` is always set to `/opt/ml/model`.
-
- `SM_NUM_GPUS`: An integer representing the number of GPUs available to the host.
-
- `SM_CHANNEL_XXXX:` A string that represents the path to the directory that contains the input data for the specified channel. For example, if you specify two input channels in the HuggingFace estimator’s fit call, named `train` and `test`, the environment variables `SM_CHANNEL_TRAIN` and `SM_CHANNEL_TEST` are set.
-
-You can find a full list of the exposed environment variables [here](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md).
-
-Later we define `hyperparameters` in the [HuggingFace Estimator](#create-an-huggingface-estimator), which are passed in as named arguments and can be processed with the `ArgumentParser()`.
-
-```python
-import transformers
-import datasets
-import argparse
-import os
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-
-    # hyperparameters sent by the client are passed as command-line arguments to the script.
-    parser.add_argument("--epochs", type=int, default=3)
-    parser.add_argument("--per_device_train_batch_size", type=int, default=32)
-    parser.add_argument("--model_name_or_path", type=str)
-
-    # Data, model, and output directories
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
-    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
-```
-
-_Note that SageMaker doesn’t support argparse actions. For example, if you want to use a boolean hyperparameter, specify `type` as `bool` in your script and provide an explicit `True` or `False` value._
-
-For a complete example of a 🤗 Transformers training script, see [train.py](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py)
-
-### Create an HuggingFace Estimator
-
-You run 🤗 Transformers training scripts on SageMaker by creating `HuggingFace` Estimators. The Estimator handles end-to-end Amazon SageMaker training. The training of your script is invoked when you call `fit` on a `HuggingFace` Estimator. In the Estimator you define which fine-tuning script should be used as `entry_point`, which `instance_type` should be used, and which `hyperparameters` are passed in. You can find all possible `HuggingFace` parameters [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html#huggingface-estimator) and an example of a fine-tuning script [here](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py).
-You can find all useable `instance_types` [here](https://aws.amazon.com/de/sagemaker/pricing/).
-
-The following code sample shows how you train a custom `HuggingFace` script `train.py`, passing in three hyperparameters (`epochs`, `per_device_train_batch_size`, and `model_name_or_path`).
-
-```python
-from sagemaker.huggingface import HuggingFace
-
-
-# hyperparameters, which are passed into the training job
-hyperparameters={'epochs': 1,
-                 'per_device_train_batch_size': 32,
-                 'model_name_or_path': 'distilbert-base-uncased'
-                 }
-
-# create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters = hyperparameters
-)
-```
-
-To run the `TrainingJob` locally you can define `instance_type='local'` or `instance_type='local-gpu'` for gpu usage. _Note: this does not work within SageMaker Studio_
-
-### Execute Training
-
-You start your `TrainingJob` by calling `fit` on a `HuggingFace` Estimator. In the `fit` method you specify your input training data, like a string S3 URI `s3://my-bucket/my-training-data` or a `FileSystemInput` for [EFS or FSx Lustre](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs), see [here](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs).
-
-```python
-huggingface_estimator.fit(
-  {'train': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/train',
-   'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'}
-)
-
-```
-
-SageMaker takes care of starting and managing all the required EC2 instances for you and starts the training job by running:
-
-```bash
-/opt/conda/bin/python train.py --epochs 1 --model_name_or_path distilbert-base-uncased --per_device_train_batch_size 32
-```
-
-### Access trained model
-
-After training is done you can access your model either through the [AWS console](https://console.aws.amazon.com/console/home?nc2=h_ct&src=header-signin) or downloading it directly from S3.
-
-```python
-from sagemaker.s3 import S3Downloader
-
-S3Downloader.download(
-    s3_uri=huggingface_estimator.model_data, # s3 uri where the trained model is located
-    local_path='.', # local path where *.tar.gz is saved
-    sagemaker_session=sess # sagemaker session used for training the model
-)
-```
-
---
-
-## Sample Notebooks
-
-You can find here a list of the official notebooks provided by Hugging Face.
-
-| Notebook                                                                                                                                                                                        | Description                                                                                                      |
-| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
-| [Getting Started Pytorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb)                                                   | End-to-End binary Text-Classification example using `Trainer` and `imdb` dataset                                 |
-| [Getting Started Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)                                             | End-to-End binary Text-Classification example using `Keras` and `imdb` dataset                                   |
-| [Distributed Training Data Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb)                       | End-to-End distributed Question-Answering example using `Trainer` and 🤗 Transformers example script for `SQuAD` |
-| [Distributed Training Model Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)                     | End-to-End model parallelism example using `SageMakerTrainer` and `run_glue.py` script                           |
-| [Spot Instances and continued training](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)                                              | End-to-End to Text-Classification example using spot instances with continued training.                          |
-| [SageMaker Metrics](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)                                                               | End-to-End to Text-Classification example using SageMaker Metrics to extract and log metrics during training     |
-| [Distributed Training Data Parallelism Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | End-to-End distributed binary Text-Classification example using `Keras` and `TensorFlow`                         |
-| [Distributed Seq2Seq Training with Data Parallelism and BART](https://github.com/huggingface/notebooks/blob/master/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) | End-to-End distributed summarization example `BART-large` and 🤗 Transformers example script for `summarization`                        |
-
-
---
-
-## Advanced Features
-
-In addition to the Deep Learning Container and the SageMaker SDK, we have implemented other additional features.
-
-### Distributed Training: Data-Parallel
-
-You can use [SageMaker Data Parallelism Library](https://aws.amazon.com/blogs/aws/managed-data-parallelism-in-amazon-sagemaker-simplifies-training-on-large-datasets/) out of the box for distributed training. We added the functionality of Data Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.
-
- [Example Notebook PyTorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
- [Example Notebook TensorFlow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
-
-```python
-# configuration for running training on smdistributed Data Parallel
-distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
-
-# create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3dn.24xlarge',
-        instance_count=2,
-        role=role,
-        transformers_version='4.4.2',
-        pytorch_version='1.6.0',
-        py_version='py36',
-        hyperparameters = hyperparameters,
-        distribution = distribution
-)
-
-```
-
-### Distributed Training: Model-Parallel
-
-You can use [SageMaker Model Parallelism Library](https://aws.amazon.com/blogs/aws/amazon-sagemaker-simplifies-training-deep-learning-models-with-billions-of-parameters/) out of the box for distributed training. We added the functionality of Model Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.  
-For detailed information about the adjustments take a look [here](https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html?highlight=modelparallel#required-sagemaker-python-sdk-parameters).
-
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
-
-
-```python
-# configuration for running training on smdistributed Model Parallel
-mpi_options = {
-    "enabled" : True,
-    "processes_per_host" : 8
-}
-
-smp_options = {
-    "enabled":True,
-    "parameters": {
-        "microbatches": 4,
-        "placement_strategy": "spread",
-        "pipeline": "interleaved",
-        "optimize": "speed",
-        "partitions": 4,
-        "ddp": True,
-    }
-}
-
-distribution={
-    "smdistributed": {"modelparallel": smp_options},
-    "mpi": mpi_options
-}
-
- # create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3dn.24xlarge',
-        instance_count=2,
-        role=role,
-        transformers_version='4.4.2',
-        pytorch_version='1.6.0',
-        py_version='py36',
-        hyperparameters = hyperparameters,
-        distribution = distribution
-)
-```
-
-### Spot Instances
-
-With the creation of HuggingFace Framework extension for the SageMaker Python SDK we can also leverage the benefit of [fully-managed EC2 spot instances](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html) and save up to 90% of our training cost.
-
-_Note: Unless your training job completes quickly, we recommend you use [checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html) with managed spot training, therefore you need to define the `checkpoint_s3_uri`._
-
-To use spot instances with the `HuggingFace` Estimator we have to set the `use_spot_instances` parameter to `True` and define your `max_wait` and `max_run` time. You can read more about the [managed spot training lifecycle here](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html).
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)
-
-```python
-# hyperparameters, which are passed into the training job
-hyperparameters={'epochs': 1,
-                 'train_batch_size': 32,
-                 'model_name':'distilbert-base-uncased',
-                 'output_dir':'/opt/ml/checkpoints'
-                 }
-# create the Estimator
-
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        checkpoint_s3_uri=f's3://{sess.default_bucket()}/checkpoints',
-        use_spot_instances=True,
-        max_wait=3600, # This should be equal to or greater than max_run in seconds'
-        max_run=1000,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters = hyperparameters
-)
-
-# Training seconds: 874
-# Billable seconds: 262
-# Managed Spot Training savings: 70.0%
-
-```
-
-### Git Repository
-
-When you create a `HuggingFace` Estimator, you can specify a [training script that is stored in a GitHub repository](https://sagemaker.readthedocs.io/en/stable/overview.html#use-scripts-stored-in-a-git-repository) as the entry point for the estimator, so that you don’t have to download the scripts locally. If Git support is enabled, the `entry_point` and `source_dir` should be relative paths in the Git repo if provided. 
-
-If you are using `git_config` to run the [🤗 Transformers example scripts](https://github.com/huggingface/transformers/tree/master/examples) keep in mind that you need to configure the right `'branch'` for your `transformers_version`, e.g. if you use `transformers_version='4.4.2'` you have to use `'branch':'v4.4.2'`.
-
-The following example uses `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/text-classification).
-
-_Tip: define `output_dir` as `/opt/ml/model` in the hyperparameter for the script to save your model to S3 after training._
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)
-
-```python
-# configure git settings
-git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.4.2'} # v4.4.2 is referring to the `transformers_version you use in the estimator.
-
- # create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='run_glue.py',
-        source_dir='./examples/text-classification',
-        git_config=git_config,
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters=hyperparameters
-)
-
-```
-
-### SageMaker Metrics
-
-[SageMaker Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html#define-train-metrics) can automatically parse the logs for metrics and send those metrics to CloudWatch. If you want SageMaker to parse logs you have to specify the metrics that you want SageMaker to send to CloudWatch when you configure the training job. You specify the name of the metrics that you want to send and the regular expressions that SageMaker uses to parse the logs that your algorithm emits to find those metrics.
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)
-
-```python
-# define metrics definitions
-
-metric_definitions = [
-{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
-{"Name": "eval_accuracy", "Regex": "eval_accuracy.*=\D*(.*?)$"},
-{"Name": "eval_loss", "Regex": "eval_loss.*=\D*(.*?)$"},
-]
-
-# create the Estimator
-
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        metric_definitions=metric_definitions,
-        hyperparameters = hyperparameters)
-
-```
-
-## Additional Resources
-
- [Announcement Blog Post](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face)
-
- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/)
-
- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
-
- [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -54,11 +54,12 @@ Sequence Classification

 Sequence classification is the task of classifying sequences according to a given number of classes. An example of
 sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune a
-model on a GLUE sequence classification task, you may leverage the :prefix_link:`run_glue.py
-<examples/text-classification/run_glue.py>`, :prefix_link:`run_tf_glue.py
-<examples/text-classification/run_tf_glue.py>`, :prefix_link:`run_tf_text_classification.py
-<examples/text-classification/run_tf_text_classification.py>` or :prefix_link:`run_xnli.py
-<examples/text-classification/run_xnli.py>` scripts.
+model on a GLUE sequence classification task, you may leverage the `run_glue.py
+<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`__ and
+`run_pl_glue.py
+<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_pl_glue.py>`__ or
+`run_tf_glue.py
+<https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`__ scripts.

 Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative. It
 leverages a fine-tuned model on sst2, which is a GLUE task.
@@ -167,8 +168,9 @@ Extractive Question Answering

 Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
 question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune a
-model on a SQuAD task, you may leverage the `run_qa.py
-<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_qa.py>`__ and `run_tf_squad.py
+model on a SQuAD task, you may leverage the `run_squad.py
+<https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_squad.py>`__ and
+`run_tf_squad.py
 <https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_tf_squad.py>`__ scripts.


@@ -240,6 +242,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    ...     input_ids = inputs["input_ids"].tolist()[0]
    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    ...     outputs = model(**inputs)
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
@@ -283,6 +286,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
    ...     input_ids = inputs["input_ids"].numpy()[0]
    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    ...     outputs = model(inputs)
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
@@ -324,9 +328,7 @@ Masked language modeling is the task of masking tokens in a sequence with a mask
 fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
 right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis for
 downstream tasks requiring bi-directional context, such as SQuAD (question answering, see `Lewis, Liu, Goyal et al.
-<https://arxiv.org/abs/1910.13461>`__, part 4.2). If you would like to fine-tune a model on a masked language modeling
-task, you may leverage the `run_mlm.py
-<https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_mlm.py>`__ script.
+<https://arxiv.org/abs/1910.13461>`__, part 4.2).

 Here is an example of using pipelines to replace a mask from a sequence:

@@ -434,8 +436,7 @@ Causal Language Modeling

 Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
 model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
-for generation tasks. If you would like to fine-tune a model on a causal language modeling task, you may leverage the
-`run_clm.py <https://github.com/huggingface/transformers/tree/master/examples/language-modeling/run_clm.py>`__ script.
+for generation tasks.

 Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the
 input sequence.
@@ -454,7 +455,7 @@ of tokens.
    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")

-    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "

    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")

@@ -603,7 +604,11 @@ Named Entity Recognition (NER) is the task of classifying tokens according to a
 as a person, an organisation or a location. An example of a named entity recognition dataset is the CoNLL-2003 dataset,
 which is entirely based on that task. If you would like to fine-tune a model on an NER task, you may leverage the
 `run_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_ner.py>`__
-script.
+(PyTorch), `run_pl_ner.py
+<https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_pl_ner.py>`__ (leveraging
+pytorch-lightning) or the `run_tf_ner.py
+<https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_tf_ner.py>`__ (TensorFlow)
+scripts.

 Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as
 belonging to one of 9 classes:
@@ -741,9 +746,7 @@ token. The following array should be the output:
 Summarization
 -----------------------------------------------------------------------------------------------------------------------

-Summarization is the task of summarizing a document or an article into a shorter text. If you would like to fine-tune a
-model on a summarization task, you may leverage the `run_summarization.py
-<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_summarization.py>`__ script.
+Summarization is the task of summarizing a document or an article into a shorter text.

 An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was
 created for the task of summarization. If you would like to fine-tune a model on a summarization task, various
@@ -821,9 +824,7 @@ CNN / Daily Mail), it yields very good results.
 Translation
 -----------------------------------------------------------------------------------------------------------------------

-Translation is the task of translating a text from one language to another. If you would like to fine-tune a model on a
-translation task, you may leverage the `run_translation.py
-<https://github.com/huggingface/transformers/tree/master/examples/seq2seq/run_translation.py>`__ script.
+Translation is the task of translating a text from one language to another.

 An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input
 data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1,4 +1,4 @@
-..
+.. 
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -70,19 +70,19 @@ Run all:

 .. code-block:: console

-    pytest
+   pytest

 or:

 .. code-block:: bash

-    make test
+   make test

 Note that the latter is defined as:

 .. code-block:: bash

-    python -m pytest -n auto --dist=loadfile -s -v ./tests/
+   python -m pytest -n auto --dist=loadfile -s -v ./tests/

 which tells pytest to:

@@ -100,13 +100,13 @@ All tests of the test suite:

 .. code-block:: bash

-    pytest --collect-only -q
+   pytest --collect-only -q

 All tests of a given test file:

 .. code-block:: bash

-    pytest tests/test_optimization.py --collect-only -q
+   pytest tests/test_optimization.py --collect-only -q



@@ -117,7 +117,7 @@ To run an individual test module:

 .. code-block:: bash

-    pytest tests/test_logging.py
+   pytest tests/test_logging.py


 Run specific tests
@@ -128,7 +128,7 @@ class containing those tests. For example, it could be:

 .. code-block:: bash

-    pytest tests/test_optimization.py::OptimizationTest::test_adam_w
+   pytest tests/test_optimization.py::OptimizationTest::test_adam_w

 Here:

@@ -140,7 +140,7 @@ If the file contains multiple classes, you can choose to run only tests of a giv

 .. code-block:: bash

-    pytest tests/test_optimization.py::OptimizationTest
+   pytest tests/test_optimization.py::OptimizationTest


 will run all the tests inside that class.
@@ -149,7 +149,8 @@ As mentioned earlier you can see what tests are contained inside the ``Optimizat

 .. code-block:: bash

-    pytest tests/test_optimization.py::OptimizationTest --collect-only -q
+   pytest tests/test_optimization.py::OptimizationTest --collect-only -q
+

 You can run tests by keyword expressions.

@@ -157,36 +158,20 @@ To run only tests whose name contains ``adam``:

 .. code-block:: bash

-    pytest -k adam tests/test_optimization.py
-
-Logical ``and`` and ``or`` can be used to indicate whether all keywords should match or either. ``not`` can be used to
-negate.
+   pytest -k adam tests/test_optimization.py

 To run all tests except those whose name contains ``adam``:

 .. code-block:: bash

-    pytest -k "not adam" tests/test_optimization.py
+   pytest -k "not adam" tests/test_optimization.py

 And you can combine the two patterns in one:

-.. code-block:: bash
-
-    pytest -k "ada and not adam" tests/test_optimization.py
-
-For example to run both ``test_adafactor`` and ``test_adam_w`` you can use:

 .. code-block:: bash

-    pytest -k "test_adafactor or test_adam_w" tests/test_optimization.py
-
-Note that we use ``or`` here, since we want either of the keywords to match to include both.
-
-If you want to include only tests that include both patterns, ``and`` is to be used:
-
-.. code-block:: bash
-
-    pytest -k "test and ada" tests/test_optimization.py
+   pytest -k "ada and not adam" tests/test_optimization.py



@@ -251,7 +236,7 @@ example, to run all except ``test_modeling_*.py`` tests:

 .. code-block:: bash

-    pytest `ls -1 tests/*py | grep -v test_modeling`
+   pytest `ls -1 tests/*py | grep -v test_modeling`


 Clearing state
@@ -292,13 +277,13 @@ Repeat tests

 .. code-block:: bash

-    pip install pytest-flakefinder
+   pip install pytest-flakefinder

 And then run every test multiple times (50 by default):

 .. code-block:: bash

-    pytest --flake-finder --flake-runs=5 tests/test_failing_test.py
+   pytest --flake-finder --flake-runs=5 tests/test_failing_test.py

 .. note::
   This plugin doesn't work with ``-n`` flag from ``pytest-xdist``.
@@ -322,19 +307,19 @@ As explained earlier this allows detection of coupled tests - where one test's s

 .. code-block:: bash

-    pytest tests
-    [...]
-    Using --random-order-bucket=module
-    Using --random-order-seed=573663
+   pytest tests
+   [...]
+   Using --random-order-bucket=module
+   Using --random-order-seed=573663

 So that if the given particular sequence fails, you can reproduce it by adding that exact seed, e.g.:

 .. code-block:: bash

-    pytest --random-order-seed=573663
-    [...]
-    Using --random-order-bucket=module
-    Using --random-order-seed=573663
+   pytest --random-order-seed=573663
+   [...]
+   Using --random-order-bucket=module
+   Using --random-order-seed=573663

 It will only reproduce the exact order if you use the exact same list of tests (or no list at all). Once you start to
 manually narrow down the list you can no longer rely on the seed, but have to list them manually in the exact order
@@ -342,7 +327,7 @@ they failed and tell pytest to not randomize them instead using ``--random-order

 .. code-block:: bash

-    pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py
+   pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py

 To disable the shuffling for all tests:

@@ -369,7 +354,7 @@ progressbar, and show tests that fail and the assert instantly. It gets activate

 .. code-block:: bash

-    pip install pytest-sugar
+   pip install pytest-sugar

 To run tests without it, run:

@@ -388,7 +373,7 @@ For a single or a group of tests via ``pytest`` (after ``pip install pytest-pspe

 .. code-block:: bash

-    pytest --pspec tests/test_optimization.py
+   pytest --pspec tests/test_optimization.py 



@@ -490,8 +475,8 @@ Inside tests:

 .. code-block:: bash

-    from transformers.testing_utils import get_gpu_count
-    n_gpu = get_gpu_count() # works with torch and tf
+   from transformers.testing_utils import get_gpu_count
+   n_gpu = get_gpu_count() # works with torch and tf



@@ -514,8 +499,8 @@ You will need at least 2 GPUs to see these tests in action:

 .. code-block:: bash

-    CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \
-    examples/seq2seq/test_seq2seq_examples_multi_gpu.py
+   CUDA_VISIBLE_DEVICES="0,1" RUN_SLOW=1 pytest -sv examples/seq2seq/test_finetune_trainer.py \
+   examples/seq2seq/test_seq2seq_examples_multi_gpu.py


 Output capture
@@ -528,13 +513,13 @@ To disable output capturing and to get the ``stdout`` and ``stderr`` normally, u

 .. code-block:: bash

-    pytest -s tests/test_logging.py
+   pytest -s tests/test_logging.py

 To send test results to JUnit format output:

 .. code-block:: bash

-    py.test tests --junitxml=result.xml
+   py.test tests --junitxml=result.xml


 Color control
@@ -544,7 +529,7 @@ To have no color (e.g., yellow on white background is not readable):

 .. code-block:: bash

-    pytest --color=no tests/test_logging.py
+   pytest --color=no tests/test_logging.py



@@ -555,7 +540,7 @@ Creating a URL for each test failure:

 .. code-block:: bash

-    pytest --pastebin=failed tests/test_logging.py
+   pytest --pastebin=failed tests/test_logging.py

 This will submit test run information to a remote Paste service and provide a URL for each failure. You may select
 tests as usual or add for example -x if you only want to send one particular failure.
@@ -564,7 +549,7 @@ Creating a URL for a whole test session log:

 .. code-block:: bash

-    pytest --pastebin=all tests/test_logging.py
+   pytest --pastebin=all tests/test_logging.py



@@ -606,13 +591,13 @@ and you could run just the ``negative`` and ``integer`` sets of params with:

 .. code-block:: bash

-    pytest -k "negative and integer" tests/test_mytest.py
+   pytest -k "negative and integer" tests/test_mytest.py

 or all but ``negative`` sub-tests, with:

 .. code-block:: bash

-    pytest -k "not negative" tests/test_mytest.py
+   pytest -k "not negative" tests/test_mytest.py

 Besides using the ``-k`` filter that was just mentioned, you can find out the exact name of each sub-test and run any
 or all of them using their exact names.
@@ -672,7 +657,7 @@ and it will list:

    test_this2.py::test_floor[integer-1-1.0]
    test_this2.py::test_floor[negative--1.5--2.0]
-    test_this2.py::test_floor[large fraction-1.6-1]
+    test_this2.py::test_floor[large fraction-1.6-1]       

 So now you can run just the specific test:

@@ -795,23 +780,6 @@ leave any data in there.
   otherwise.


-Temporary sys.path override
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If you need to temporarily override ``sys.path`` to import from another test for example, you can use the
-``ExtendSysPath`` context manager. Example:
-
-
-.. code-block:: python
-
-    import os
-    from transformers.testing_utils import ExtendSysPath
-    bindir = os.path.abspath(os.path.dirname(__file__))
-    with ExtendSysPath(f"{bindir}/.."):
-        from test_trainer import TrainerIntegrationCommon  # noqa
-
-
-
 Skipping tests
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/troubleshooting.md
+++ b/docs/source/troubleshooting.md
@@ -1,30 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# Troubleshooting
-
-This document is to help find solutions for common problems.
-
-## Firewalled environments
-
-Some cloud and intranet setups have their GPU instances firewalled to the outside world, so if your script is trying to download model weights or datasets it will first hang and then timeout with an error message like:
-
-```
-ValueError: Connection error, and we cannot find the requested files in the cached path.
-Please try again or make sure your Internet connection is on.
-```
-
-One possible solution in this situation is to use the ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode).
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,5 +1,6 @@
 <!---
 Copyright 2020 The HuggingFace Team. All rights reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -15,13 +16,8 @@ limitations under the License.

 # Examples

-This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects) or to the [legacy](https://github.com/huggingface/transformers/tree/master/examples/legacy) subfolder.
-
-While we strive to present as many use cases as possible, the scripts in this folder are just examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, all the PyTorch versions of the examples fully expose the preprocessing of the data. This way, you can easily tweak them.
-
-This is similar if you want the scripts to report another metric than the one they currently use: look at the `compute_metrics` function inside the script. It takes the full arrays of predictions and labels and has to return a dictionary of string keys and float values. Just change it to add (or replace) your own metric to the ones already reported.
-
-Please discuss on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) a feature you would like to implement in an example before submitting a PR: we welcome bug fixes but since we want to keep the examples as simple as possible, it's unlikely we will merge a pull request adding more functionality at the cost of readability.
+This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to
+be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects).

 ## Important note

@@ -38,43 +34,10 @@ Then cd in the example folder of your choice and run
 pip install -r requirements.txt
 ```

-To browse the examples corresponding to released versions of 🤗 Transformers, click on the line below and then on your desired version of the library:
-
-<details>
-  <summary>Examples for older versions of 🤗 Transformers</summary>
-
-  - [v4.3.3](https://github.com/huggingface/transformers/tree/v4.3.3/examples)
-  - [v4.2.2](https://github.com/huggingface/transformers/tree/v4.2.2/examples)
-  - [v4.1.1](https://github.com/huggingface/transformers/tree/v4.1.1/examples)
-  - [v4.0.1](https://github.com/huggingface/transformers/tree/v4.0.1/examples)
-  - [v3.5.1](https://github.com/huggingface/transformers/tree/v3.5.1/examples)
-  - [v3.4.0](https://github.com/huggingface/transformers/tree/v3.4.0/examples)
-  - [v3.3.1](https://github.com/huggingface/transformers/tree/v3.3.1/examples)
-  - [v3.2.0](https://github.com/huggingface/transformers/tree/v3.2.0/examples)
-  - [v3.1.0](https://github.com/huggingface/transformers/tree/v3.1.0/examples)
-  - [v3.0.2](https://github.com/huggingface/transformers/tree/v3.0.2/examples)
-  - [v2.11.0](https://github.com/huggingface/transformers/tree/v2.11.0/examples)
-  - [v2.10.0](https://github.com/huggingface/transformers/tree/v2.10.0/examples)
-  - [v2.9.1](https://github.com/huggingface/transformers/tree/v2.9.1/examples)
-  - [v2.8.0](https://github.com/huggingface/transformers/tree/v2.8.0/examples)
-  - [v2.7.0](https://github.com/huggingface/transformers/tree/v2.7.0/examples)
-  - [v2.6.0](https://github.com/huggingface/transformers/tree/v2.6.0/examples)
-  - [v2.5.1](https://github.com/huggingface/transformers/tree/v2.5.1/examples)
-  - [v2.4.0](https://github.com/huggingface/transformers/tree/v2.4.0/examples)
-  - [v2.3.0](https://github.com/huggingface/transformers/tree/v2.3.0/examples)
-  - [v2.2.0](https://github.com/huggingface/transformers/tree/v2.2.0/examples)
-  - [v2.1.1](https://github.com/huggingface/transformers/tree/v2.1.1/examples)
-  - [v2.0.0](https://github.com/huggingface/transformers/tree/v2.0.0/examples)
-  - [v1.2.0](https://github.com/huggingface/transformers/tree/v1.2.0/examples)
-  - [v1.1.0](https://github.com/huggingface/transformers/tree/v1.1.0/examples)
-  - [v1.0.0](https://github.com/huggingface/transformers/tree/v1.0.0/examples)
-</details>
-
-Alternatively, you can switch your cloned 🤗 Transformers to a specific version (for instance with v3.5.1) with
+Alternatively, you can run the version of the examples as they were for your current version of Transformers via (for instance with v3.5.1):
 ```bash
 git checkout tags/v3.5.1
 ```
-and run the example command as usual afterward.

 ## The Big Table of Tasks

@@ -90,47 +53,21 @@ Coming soon!

 | Task | Example datasets | Trainer support | TFTrainer support | 🤗 Datasets | Colab
 |---|---|:---:|:---:|:---:|:---:|
-| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | WikiText-2      | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)
-| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG            | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)
+| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | Raw text        | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
+| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
 | [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)
-| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                     |  XSum           | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)
-| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE            | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
+| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                     | CNN/Daily Mail  | ✅  | - | - | -
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
 | [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)           | -               | n/a | n/a | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
 | [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)
-| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                       | WMT             | ✅  | - | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb)
+| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                       | WMT             | ✅  | - | - | -


-## Running quick tests
-
-Most examples are equipped with a mechanism to truncate the number of dataset samples to the desired length. This is useful for debugging purposes, for example to quickly check that all stages of the programs can complete, before running the same setup on the full dataset which may take hours to complete.
-
-For example here is how to truncate all three splits to just 50 samples each:
-```
-examples/token-classification/run_ner.py \
--max_train_samples 50 \
--max_val_samples 50 \
--max_test_samples 50 \
-[...]
-```
-
-Most example scripts should have the first two command line arguments and some have the third one. You can quickly check if a given example supports any of these by passing a `-h` option, e.g.:
-```
-examples/token-classification/run_ner.py -h
-```
-
-## Resuming training
-
-You can resume training from a previous checkpoint like this:
-
-1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance).
-2. Pass `--model_name_or_path path_to_a_specific_checkpoint` to resume training from that checkpoint folder.
-
-Should you want to turn an example into a notebook where you'd no longer have access to the command
-line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`.
-
-1. If `resume_from_checkpoint` is `True` it will look for the last checkpoint in the value of `output_dir` passed via `TrainingArguments`.
-2. If `resume_from_checkpoint` is a path to a specific checkpoint it will use that saved checkpoint folder to resume the training from.
+<!--
+## One-click Deploy to Cloud (wip)

+**Coming soon!**
+-->

 ## Distributed training and mixed precision

@@ -141,7 +78,7 @@ use the following command:
 ```bash
 python -m torch.distributed.launch \
    --nproc_per_node number_of_gpu_you_have path_to_script.py \
-	--all_arguments_of_the_script
+	--all_arguments_of_the_script 
 ```

 As an example, here is how you would fine-tune the BERT large model (with whole word masking) on the text
@@ -185,7 +122,7 @@ regular training script with its arguments (this is similar to the `torch.distri
 ```bash
 python xla_spawn.py --num_cores num_tpu_you_have \
    path_to_script.py \
-	--all_arguments_of_the_script
+	--all_arguments_of_the_script 
 ```

 As an example, here is how you would fine-tune the BERT large model (with whole word masking) on the text
@@ -210,7 +147,7 @@ python xla_spawn.py --num_cores 8 \
 You can easily log and monitor your runs. The following are currently supported:

 * [TensorBoard](https://www.tensorflow.org/tensorboard)
-* [Weights & Biases](https://docs.wandb.ai/integrations/huggingface)
+* [Weights & Biases](https://docs.wandb.com/library/integrations/huggingface)
 * [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/)

 ### Weights & Biases
@@ -234,23 +171,9 @@ import wandb
 wandb.login()
 ```

-To enable logging to W&B, include `"wandb"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to all` if you have `wandb` installed.
-
 Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metrics, model topology and gradients (for `Trainer` only) will automatically be logged.

-Advanced configuration is possible by setting environment variables:
-
-| Environment Variable | Value |
-|---|---|
-| WANDB_LOG_MODEL | Log the model as artifact at the end of training (`false` by default) |
-| WANDB_WATCH | one of `gradients` (default) to log histograms of gradients, `all` to log histograms of both gradients and parameters, or `false` for no histogram logging |
-| WANDB_PROJECT | Organize runs by project |
-
-Set run names with `run_name` argument present in scripts or as part of `TrainingArguments`.
-
-Additional configuration options are available through generic [wandb environment variables](https://docs.wandb.com/library/environment-variables).
-
-Refer to related [documentation & examples](https://docs.wandb.ai/integrations/huggingface).
+When using 🤗 Transformers with PyTorch Lightning, runs can be tracked through `WandbLogger`. Refer to related [documentation & examples](https://docs.wandb.com/library/integrations/lightning).

 ### Comet.ml

--- a/examples/_tests_requirements.txt
+++ b/examples/_tests_requirements.txt
@@ -2,7 +2,7 @@ tensorboard
 scikit-learn
 seqeval
 psutil
-sacrebleu >= 1.4.12
+sacrebleu
 rouge-score
 tensorflow_datasets
 matplotlib
--- a/examples/benchmarking/run_benchmark.py
+++ b/examples/benchmarking/run_benchmark.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/benchmarking/run_benchmark_tf.py
+++ b/examples/benchmarking/run_benchmark_tf.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The HuggingFace Inc. team.
 # Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -22,11 +22,12 @@ ALBERT, BERT, DistilBERT, RoBERTa, XLNet... GPT and GPT-2 are trained or fine-tu
 loss. XLNet uses permutation language modeling (PLM), you can find more information about the differences between those
 objectives in our [model summary](https://huggingface.co/transformers/model_summary.html).

-There are two sets of scripts provided. The first set leverages the Trainer API. The second set with `no_trainer` in the suffix uses a custom training loop and leverages the 🤗 Accelerate library . Both sets use the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
+These scripts leverage the 🤗 Datasets library and the Trainer API. You can easily customize them to your needs if you
+need extra processing on your datasets.

 **Note:** The old script `run_language_modeling.py` is still available [here](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py).

-The following examples, will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own
+The following examples will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own
 text files for training and validation. We give examples of both below.

 ### GPT-2/GPT and causal language modeling
@@ -59,15 +60,6 @@ python run_clm.py \
    --output_dir /tmp/test-clm
 ```

-This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_clm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
-
-```bash
-python run_clm_no_trainer.py \
-    --dataset_name wikitext \
-    --dataset_config_name wikitext-2-raw-v1 \
-    --model_name_or_path gpt2 \
-    --output_dir /tmp/test-clm
-```

 ### RoBERTa/BERT/DistilBERT and masked language modeling

@@ -103,33 +95,23 @@ python run_mlm.py \
 If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
 concatenates all texts and then splits them in blocks of the same length).

-This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_mlm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
-
-```bash
-python run_mlm_no_trainer.py \
-    --dataset_name wikitext \
-    --dataset_config_name wikitext-2-raw-v1 \
-    --model_name_or_path roberta-base \
-    --output_dir /tmp/test-mlm
-```
-
 **Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
 sure all your batches have the same length.

 ### Whole word masking

-This part was moved to `examples/research_projects/mlm_wwm`.
+This part was moved to `examples/research_projects/mlm_wwm`. 

 ### XLNet and permutation language modeling

-XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
-to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input
+XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method 
+to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input 
 sequence factorization order.

-We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding
+We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding 
 context length for permutation language modeling.

-The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used
+The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used 
 for permutation language modeling.

 Here is how to fine-tune XLNet on wikitext-2:
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
 #
@@ -43,14 +42,9 @@ from transformers import (
    default_data_collator,
    set_seed,
 )
-from transformers.testing_utils import CaptureLogger
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.6.0.dev0")
-
 logger = logging.getLogger(__name__)


@@ -119,26 +113,11 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )
-
    block_size: Optional[int] = field(
        default=None,
        metadata={
-            "help": "Optional input sequence length after tokenization. "
-            "The training dataset will be truncated in block of this size for training. "
+            "help": "Optional input sequence length after tokenization."
+            "The training dataset will be truncated in block of this size for training."
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
@@ -214,7 +193,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info(f"Training/evaluation parameters {training_args}")
+    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -230,19 +209,17 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
-                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
-                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -257,7 +234,7 @@ def main():
        )
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -319,18 +296,8 @@ def main():
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

-    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
-    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
-
    def tokenize_function(examples):
-        with CaptureLogger(tok_logger) as cl:
-            output = tokenizer(examples[text_column_name])
-        # clm input could be much much longer than block_size
-        if "Token indices sequence length is longer than the" in cl.out:
-            tok_logger.warning(
-                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
-            )
-        return output
+        return tokenizer(examples[text_column_name])

    tokenized_datasets = datasets.map(
        tokenize_function,
@@ -343,14 +310,14 @@ def main():
    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
-            logger.warning(
+            logger.warn(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
        block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
-            logger.warning(
+            logger.warn(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
@@ -378,7 +345,6 @@ def main():
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
@@ -386,26 +352,12 @@ def main():
        load_from_cache_file=not data_args.overwrite_cache,
    )

-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = lm_datasets["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = lm_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=lm_datasets["train"] if training_args.do_train else None,
+        eval_dataset=lm_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
@@ -422,30 +374,36 @@ def main():
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

-        metrics = train_result.metrics
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-
-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
+        eval_output = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-        perplexity = math.exp(metrics["eval_loss"])
-        metrics["perplexity"] = perplexity
+        perplexity = math.exp(eval_output["eval_loss"])
+        results["perplexity"] = perplexity

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_clm.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/language-modeling/run_clm_no_trainer.py
+++ b/examples/language-modeling/run_clm_no_trainer.py
@@ -1,456 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for causal language modeling (BERT, ALBERT, RoBERTa...)
-on a text file or a dataset without using HuggingFace Trainer.
-
-Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
-https://huggingface.co/models?filter=causal-lm
-"""
-# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
-
-import argparse
-import logging
-import math
-import os
-import random
-
-import datasets
-import torch
-from datasets import load_dataset
-from torch.utils.data.dataloader import DataLoader
-from tqdm.auto import tqdm
-
-import transformers
-from accelerate import Accelerator
-from transformers import (
-    CONFIG_MAPPING,
-    MODEL_MAPPING,
-    AdamW,
-    AutoConfig,
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    SchedulerType,
-    default_data_collator,
-    get_scheduler,
-    set_seed,
-)
-
-
-logger = logging.getLogger(__name__)
-MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
-    parser.add_argument(
-        "--dataset_name",
-        type=str,
-        default=None,
-        help="The name of the dataset to use (via the datasets library).",
-    )
-    parser.add_argument(
-        "--dataset_config_name",
-        type=str,
-        default=None,
-        help="The configuration name of the dataset to use (via the datasets library).",
-    )
-    parser.add_argument(
-        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
-    )
-    parser.add_argument(
-        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
-    )
-    parser.add_argument(
-        "--validation_split_percentage",
-        default=5,
-        help="The percentage of the train set used as validation set in case there's no validation split",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        type=str,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-        required=True,
-    )
-    parser.add_argument(
-        "--config_name",
-        type=str,
-        default=None,
-        help="Pretrained config name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default=None,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--use_slow_tokenizer",
-        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
-    )
-    parser.add_argument(
-        "--per_device_train_batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per device) for the training dataloader.",
-    )
-    parser.add_argument(
-        "--per_device_eval_batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per device) for the evaluation dataloader.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-5,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
-    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--lr_scheduler_type",
-        type=SchedulerType,
-        default="linear",
-        help="The scheduler type to use.",
-        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
-    )
-    parser.add_argument(
-        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
-    )
-    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        default=None,
-        help="Model type to use if training from scratch.",
-        choices=MODEL_TYPES,
-    )
-    parser.add_argument(
-        "--block_size",
-        type=int,
-        default=None,
-        help="Optional input sequence length after tokenization. The training dataset will be truncated in block of this size for training. Default to the model max input length for single sentence inputs (take into account special tokens).",
-    )
-    parser.add_argument(
-        "--preprocessing_num_workers",
-        type=int,
-        default=None,
-        help="The number of processes to use for the preprocessing.",
-    )
-    parser.add_argument(
-        "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
-    )
-
-    args = parser.parse_args()
-
-    # Sanity checks
-    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
-        raise ValueError("Need either a dataset name or a training/validation file.")
-    else:
-        if args.train_file is not None:
-            extension = args.train_file.split(".")[-1]
-            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
-        if args.validation_file is not None:
-            extension = args.validation_file.split(".")[-1]
-            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
-
-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
-
-    return args
-
-
-def main():
-    args = parse_args()
-
-    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    accelerator = Accelerator()
-    # Make one log on every process with the configuration for debugging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state)
-
-    # Setup logging, we only want one process per machine to log things on the screen.
-    # accelerator.is_local_main_process is only True for one process per machine.
-    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
-    if accelerator.is_local_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-
-    # If passed along, set the training seed now.
-    if args.seed is not None:
-        set_seed(args.seed)
-
-    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
-    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-    # (the dataset will be downloaded automatically from the datasets Hub).
-    #
-    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
-    # 'text' is found. You can easily tweak this behavior (see below).
-    #
-    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
-    # download the dataset.
-    if args.dataset_name is not None:
-        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
-        if "validation" not in raw_datasets.keys():
-            raw_datasets["validation"] = load_dataset(
-                args.dataset_name,
-                args.dataset_config_name,
-                split=f"train[:{args.validation_split_percentage}%]",
-            )
-            raw_datasets["train"] = load_dataset(
-                args.dataset_name,
-                args.dataset_config_name,
-                split=f"train[{args.validation_split_percentage}%:]",
-            )
-    else:
-        data_files = {}
-        if args.train_file is not None:
-            data_files["train"] = args.train_file
-        if args.validation_file is not None:
-            data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
-        if extension == "txt":
-            extension = "text"
-        raw_datasets = load_dataset(extension, data_files=data_files)
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Load pretrained model and tokenizer
-    #
-    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-    if args.config_name:
-        config = AutoConfig.from_pretrained(args.config_name)
-    elif args.model_name_or_path:
-        config = AutoConfig.from_pretrained(args.model_name_or_path)
-    else:
-        config = CONFIG_MAPPING[args.model_type]()
-        logger.warning("You are instantiating a new config instance from scratch.")
-
-    if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
-    elif args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
-    else:
-        raise ValueError(
-            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
-            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
-        )
-
-    if args.model_name_or_path:
-        model = AutoModelForCausalLM.from_pretrained(
-            args.model_name_or_path,
-            from_tf=bool(".ckpt" in args.model_name_or_path),
-            config=config,
-        )
-    else:
-        logger.info("Training new model from scratch")
-        model = AutoModelForCausalLM.from_config(config)
-
-    model.resize_token_embeddings(len(tokenizer))
-
-    # Preprocessing the datasets.
-    # First we tokenize all the texts.
-    column_names = raw_datasets["train"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
-
-    def tokenize_function(examples):
-        return tokenizer(examples[text_column_name])
-
-    tokenized_datasets = raw_datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-    )
-
-    if args.block_size is None:
-        block_size = tokenizer.model_max_length
-        if block_size > 1024:
-            logger.warning(
-                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
-            )
-        block_size = 1024
-    else:
-        if args.block_size > tokenizer.model_max_length:
-            logger.warning(
-                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
-                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
-            )
-        block_size = min(args.block_size, tokenizer.model_max_length)
-
-    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
-    def group_texts(examples):
-        # Concatenate all texts.
-        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-        total_length = len(concatenated_examples[list(examples.keys())[0]])
-        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
-        # customize this part to your needs.
-        total_length = (total_length // block_size) * block_size
-        # Split by chunks of max_len.
-        result = {
-            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
-            for k, t in concatenated_examples.items()
-        }
-        result["labels"] = result["input_ids"].copy()
-        return result
-
-    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
-    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
-    # to preprocess.
-    #
-    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
-    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=args.preprocessing_num_workers,
-        load_from_cache_file=not args.overwrite_cache,
-    )
-
-    train_dataset = lm_datasets["train"]
-    eval_dataset = lm_datasets["validation"]
-
-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
-
-    # DataLoaders creation:
-    train_dataloader = DataLoader(
-        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
-    )
-    eval_dataloader = DataLoader(
-        eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
-    )
-
-    # Optimizer
-    # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {
-            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
-            "weight_decay": 0.0,
-        },
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-
-    # Prepare everything with our `accelerator`.
-    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
-        model, optimizer, train_dataloader, eval_dataloader
-    )
-
-    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
-    # shorter in multiprocess)
-
-    # Scheduler and math around the number of training steps.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    lr_scheduler = get_scheduler(
-        name=args.lr_scheduler_type,
-        optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
-    )
-
-    # Train!
-    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num Epochs = {args.num_train_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
-    completed_steps = 0
-
-    for epoch in range(args.num_train_epochs):
-        model.train()
-        for step, batch in enumerate(train_dataloader):
-            outputs = model(**batch)
-            loss = outputs.loss
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
-                optimizer.step()
-                lr_scheduler.step()
-                optimizer.zero_grad()
-                progress_bar.update(1)
-                completed_steps += 1
-
-            if completed_steps >= args.max_train_steps:
-                break
-
-        model.eval()
-        losses = []
-        for step, batch in enumerate(eval_dataloader):
-            with torch.no_grad():
-                outputs = model(**batch)
-
-            loss = outputs.loss
-            losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
-
-        losses = torch.cat(losses)
-        losses = losses[: len(eval_dataset)]
-        perplexity = math.exp(torch.mean(losses))
-
-        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
-
-    if args.output_dir is not None:
-        accelerator.wait_for_everyone()
-        unwrapped_model = accelerator.unwrap_model(model)
-        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -44,12 +43,8 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.6.0.dev0")
-
 logger = logging.getLogger(__name__)
 MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -150,20 +145,6 @@ class DataTrainingArguments:
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -223,7 +204,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info(f"Training/evaluation parameters {training_args}")
+    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -239,19 +220,17 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
-                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
-                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -262,7 +241,7 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -323,22 +302,6 @@ def main():
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

-    if data_args.max_seq_length is None:
-        max_seq_length = tokenizer.model_max_length
-        if max_seq_length > 1024:
-            logger.warning(
-                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
-            )
-            max_seq_length = 1024
-    else:
-        if data_args.max_seq_length > tokenizer.model_max_length:
-            logger.warning(
-                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-            )
-        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False
@@ -350,7 +313,7 @@ def main():
                examples["text"],
                padding=padding,
                truncation=True,
-                max_length=max_seq_length,
+                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
@@ -378,6 +341,22 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

+        if data_args.max_seq_length is None:
+            max_seq_length = tokenizer.model_max_length
+            if max_seq_length > 1024:
+                logger.warn(
+                    f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                    "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
+                )
+                max_seq_length = 1024
+        else:
+            if data_args.max_seq_length > tokenizer.model_max_length:
+                logger.warn(
+                    f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                    f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+                )
+            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
@@ -400,7 +379,6 @@ def main():
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
@@ -408,35 +386,16 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = tokenized_datasets["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = tokenized_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-
    # Data collator
    # This one will take care of randomly masking the tokens.
-    pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
-    data_collator = DataCollatorForLanguageModeling(
-        tokenizer=tokenizer,
-        mlm_probability=data_args.mlm_probability,
-        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
-    )
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
+        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
@@ -451,30 +410,37 @@ def main():
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
-        metrics = train_result.metrics

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
+        eval_output = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-        perplexity = math.exp(metrics["eval_loss"])
-        metrics["perplexity"] = perplexity
+        perplexity = math.exp(eval_output["eval_loss"])
+        results["perplexity"] = perplexity

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/language-modeling/run_mlm_flax.py
+++ b/examples/language-modeling/run_mlm_flax.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -307,7 +306,7 @@ def create_learning_rate_scheduler(
                progress = jnp.maximum(0.0, (step - warmup_steps) / float(steps_per_cycle))
                ret *= jnp.maximum(0.0, 0.5 * (1.0 + jnp.cos(jnp.pi * (progress % 1.0))))
            else:
-                raise ValueError(f"Unknown factor {name}.")
+                raise ValueError("Unknown factor %s." % name)
        return jnp.asarray(ret, dtype=jnp.float32)

    return step_fn
@@ -332,7 +331,9 @@ def accuracy(logits, targets, weights=None):
      Tuple of scalar loss and batch normalizing factor.
    """
    if logits.ndim != targets.ndim + 1:
-        raise ValueError(f"Incorrect shapes. Got shape {logits.shape} logits and {targets.shape} targets")
+        raise ValueError(
+            "Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape))
+        )

    loss = jnp.equal(jnp.argmax(logits, axis=-1), targets)
    loss *= weights
@@ -351,7 +352,9 @@ def cross_entropy(logits, targets, weights=None, label_smoothing=0.0):
      Tuple of scalar loss and batch normalizing factor.
    """
    if logits.ndim != targets.ndim + 1:
-        raise ValueError(f"Incorrect shapes. Got shape {logits.shape} logits and {targets.shape} targets")
+        raise ValueError(
+            "Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape))
+        )

    vocab_size = logits.shape[-1]
    confidence = 1.0 - label_smoothing
@@ -459,7 +462,7 @@ if __name__ == "__main__":
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
-    logger.info(f"Training/evaluation parameters {training_args}")
+    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -475,19 +478,17 @@ if __name__ == "__main__":
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
-                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
-                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -498,7 +499,7 @@ if __name__ == "__main__":
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

--- a/examples/language-modeling/run_mlm_no_trainer.py
+++ b/examples/language-modeling/run_mlm_no_trainer.py
@@ -1,500 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
-on a text file or a dataset without using HuggingFace Trainer.
-
-Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
-https://huggingface.co/models?filter=masked-lm
-"""
-# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
-
-import argparse
-import logging
-import math
-import os
-import random
-
-import datasets
-import torch
-from datasets import load_dataset
-from torch.utils.data.dataloader import DataLoader
-from tqdm.auto import tqdm
-
-import transformers
-from accelerate import Accelerator
-from transformers import (
-    CONFIG_MAPPING,
-    MODEL_MAPPING,
-    AdamW,
-    AutoConfig,
-    AutoModelForMaskedLM,
-    AutoTokenizer,
-    DataCollatorForLanguageModeling,
-    SchedulerType,
-    get_scheduler,
-    set_seed,
-)
-
-
-logger = logging.getLogger(__name__)
-MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
-MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Finetune a transformers model on a Masked Language Modeling task")
-    parser.add_argument(
-        "--dataset_name",
-        type=str,
-        default=None,
-        help="The name of the dataset to use (via the datasets library).",
-    )
-    parser.add_argument(
-        "--dataset_config_name",
-        type=str,
-        default=None,
-        help="The configuration name of the dataset to use (via the datasets library).",
-    )
-    parser.add_argument(
-        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
-    )
-    parser.add_argument(
-        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
-    )
-    parser.add_argument(
-        "--validation_split_percentage",
-        default=5,
-        help="The percentage of the train set used as validation set in case there's no validation split",
-    )
-    parser.add_argument(
-        "--pad_to_max_length",
-        action="store_true",
-        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        type=str,
-        help="Path to pretrained model or model identifier from huggingface.co/models.",
-        required=True,
-    )
-    parser.add_argument(
-        "--config_name",
-        type=str,
-        default=None,
-        help="Pretrained config name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default=None,
-        help="Pretrained tokenizer name or path if not the same as model_name",
-    )
-    parser.add_argument(
-        "--use_slow_tokenizer",
-        action="store_true",
-        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
-    )
-    parser.add_argument(
-        "--per_device_train_batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per device) for the training dataloader.",
-    )
-    parser.add_argument(
-        "--per_device_eval_batch_size",
-        type=int,
-        default=8,
-        help="Batch size (per device) for the evaluation dataloader.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        type=float,
-        default=5e-5,
-        help="Initial learning rate (after the potential warmup period) to use.",
-    )
-    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
-    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
-    parser.add_argument(
-        "--max_train_steps",
-        type=int,
-        default=None,
-        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
-    )
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument(
-        "--lr_scheduler_type",
-        type=SchedulerType,
-        default="linear",
-        help="The scheduler type to use.",
-        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
-    )
-    parser.add_argument(
-        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
-    )
-    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
-    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        default=None,
-        help="Model type to use if training from scratch.",
-        choices=MODEL_TYPES,
-    )
-    parser.add_argument(
-        "--max_seq_length",
-        type=int,
-        default=None,
-        help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated.",
-    )
-    parser.add_argument(
-        "--line_by_line",
-        type=bool,
-        default=False,
-        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
-    )
-    parser.add_argument(
-        "--preprocessing_num_workers",
-        type=int,
-        default=None,
-        help="The number of processes to use for the preprocessing.",
-    )
-    parser.add_argument(
-        "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
-    )
-    parser.add_argument(
-        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
-    )
-
-    args = parser.parse_args()
-
-    # Sanity checks
-    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
-        raise ValueError("Need either a dataset name or a training/validation file.")
-    else:
-        if args.train_file is not None:
-            extension = args.train_file.split(".")[-1]
-            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file."
-        if args.validation_file is not None:
-            extension = args.validation_file.split(".")[-1]
-            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."
-
-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
-
-    return args
-
-
-def main():
-    args = parse_args()
-
-    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
-    accelerator = Accelerator()
-    # Make one log on every process with the configuration for debugging.
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO,
-    )
-    logger.info(accelerator.state)
-
-    # Setup logging, we only want one process per machine to log things on the screen.
-    # accelerator.is_local_main_process is only True for one process per machine.
-    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
-    if accelerator.is_local_main_process:
-        datasets.utils.logging.set_verbosity_warning()
-        transformers.utils.logging.set_verbosity_info()
-    else:
-        datasets.utils.logging.set_verbosity_error()
-        transformers.utils.logging.set_verbosity_error()
-
-    # If passed along, set the training seed now.
-    if args.seed is not None:
-        set_seed(args.seed)
-
-    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
-    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
-    # (the dataset will be downloaded automatically from the datasets Hub).
-    #
-    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
-    # 'text' is found. You can easily tweak this behavior (see below).
-    #
-    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
-    # download the dataset.
-    if args.dataset_name is not None:
-        # Downloading and loading a dataset from the hub.
-        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
-        if "validation" not in raw_datasets.keys():
-            raw_datasets["validation"] = load_dataset(
-                args.dataset_name,
-                args.dataset_config_name,
-                split=f"train[:{args.validation_split_percentage}%]",
-            )
-            raw_datasets["train"] = load_dataset(
-                args.dataset_name,
-                args.dataset_config_name,
-                split=f"train[{args.validation_split_percentage}%:]",
-            )
-    else:
-        data_files = {}
-        if args.train_file is not None:
-            data_files["train"] = args.train_file
-        if args.validation_file is not None:
-            data_files["validation"] = args.validation_file
-        extension = args.train_file.split(".")[-1]
-        if extension == "txt":
-            extension = "text"
-        raw_datasets = load_dataset(extension, data_files=data_files)
-    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
-
-    # Load pretrained model and tokenizer
-    #
-    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-    if args.config_name:
-        config = AutoConfig.from_pretrained(args.config_name)
-    elif args.model_name_or_path:
-        config = AutoConfig.from_pretrained(args.model_name_or_path)
-    else:
-        config = CONFIG_MAPPING[args.model_type]()
-        logger.warning("You are instantiating a new config instance from scratch.")
-
-    if args.tokenizer_name:
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
-    elif args.model_name_or_path:
-        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
-    else:
-        raise ValueError(
-            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
-            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
-        )
-
-    if args.model_name_or_path:
-        model = AutoModelForMaskedLM.from_pretrained(
-            args.model_name_or_path,
-            from_tf=bool(".ckpt" in args.model_name_or_path),
-            config=config,
-        )
-    else:
-        logger.info("Training new model from scratch")
-        model = AutoModelForMaskedLM.from_config(config)
-
-    model.resize_token_embeddings(len(tokenizer))
-
-    # Preprocessing the datasets.
-    # First we tokenize all the texts.
-    column_names = raw_datasets["train"].column_names
-    text_column_name = "text" if "text" in column_names else column_names[0]
-
-    if args.max_seq_length is None:
-        max_seq_length = tokenizer.model_max_length
-        if max_seq_length > 1024:
-            logger.warning(
-                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
-                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
-            )
-            max_seq_length = 1024
-    else:
-        if args.max_seq_length > tokenizer.model_max_length:
-            logger.warning(
-                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
-                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-            )
-        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
-
-    if args.line_by_line:
-        # When using line_by_line, we just tokenize each nonempty line.
-        padding = "max_length" if args.pad_to_max_length else False
-
-        def tokenize_function(examples):
-            # Remove empty lines
-            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-            return tokenizer(
-                examples["text"],
-                padding=padding,
-                truncation=True,
-                max_length=max_seq_length,
-                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
-                # receives the `special_tokens_mask`.
-                return_special_tokens_mask=True,
-            )
-
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not args.overwrite_cache,
-        )
-    else:
-        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
-        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
-        # efficient when it receives the `special_tokens_mask`.
-        def tokenize_function(examples):
-            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
-
-        tokenized_datasets = raw_datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not args.overwrite_cache,
-        )
-
-        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
-        # max_seq_length.
-        def group_texts(examples):
-            # Concatenate all texts.
-            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
-            total_length = len(concatenated_examples[list(examples.keys())[0]])
-            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
-            # customize this part to your needs.
-            total_length = (total_length // max_seq_length) * max_seq_length
-            # Split by chunks of max_len.
-            result = {
-                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
-                for k, t in concatenated_examples.items()
-            }
-            return result
-
-        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
-        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
-        # might be slower to preprocess.
-        #
-        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
-        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
-            num_proc=args.preprocessing_num_workers,
-            load_from_cache_file=not args.overwrite_cache,
-        )
-
-    train_dataset = tokenized_datasets["train"]
-    eval_dataset = tokenized_datasets["validation"]
-
-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
-
-    # Data collator
-    # This one will take care of randomly masking the tokens.
-    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability)
-
-    # DataLoaders creation:
-    train_dataloader = DataLoader(
-        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
-    )
-    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
-
-    # Optimizer
-    # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-            "weight_decay": args.weight_decay,
-        },
-        {
-            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
-            "weight_decay": 0.0,
-        },
-    ]
-    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
-
-    # Prepare everything with our `accelerator`.
-    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
-        model, optimizer, train_dataloader, eval_dataloader
-    )
-
-    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
-    # shorter in multiprocess)
-
-    # Scheduler and math around the number of training steps.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
-        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-    else:
-        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
-
-    lr_scheduler = get_scheduler(
-        name=args.lr_scheduler_type,
-        optimizer=optimizer,
-        num_warmup_steps=args.num_warmup_steps,
-        num_training_steps=args.max_train_steps,
-    )
-
-    # Train!
-    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
-
-    logger.info("***** Running training *****")
-    logger.info(f"  Num examples = {len(train_dataset)}")
-    logger.info(f"  Num Epochs = {args.num_train_epochs}")
-    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
-    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
-    logger.info(f"  Total optimization steps = {args.max_train_steps}")
-    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
-    completed_steps = 0
-
-    for epoch in range(args.num_train_epochs):
-        model.train()
-        for step, batch in enumerate(train_dataloader):
-            outputs = model(**batch)
-            loss = outputs.loss
-            loss = loss / args.gradient_accumulation_steps
-            accelerator.backward(loss)
-            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
-                optimizer.step()
-                lr_scheduler.step()
-                optimizer.zero_grad()
-                progress_bar.update(1)
-                completed_steps += 1
-
-            if completed_steps >= args.max_train_steps:
-                break
-
-        model.eval()
-        losses = []
-        for step, batch in enumerate(eval_dataloader):
-            with torch.no_grad():
-                outputs = model(**batch)
-
-            loss = outputs.loss
-            losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
-
-        losses = torch.cat(losses)
-        losses = losses[: len(eval_dataset)]
-        perplexity = math.exp(torch.mean(losses))
-
-        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
-
-    if args.output_dir is not None:
-        accelerator.wait_for_everyone()
-        unwrapped_model = accelerator.unwrap_model(model)
-        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/language-modeling/run_plm.py
+++ b/examples/language-modeling/run_plm.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
@@ -40,12 +39,8 @@ from transformers import (
    set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint, is_main_process
-from transformers.utils import check_min_version


-# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.6.0.dev0")
-
 logger = logging.getLogger(__name__)


@@ -147,20 +142,6 @@ class DataTrainingArguments:
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
-    max_train_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
-        },
-    )
-    max_val_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
-            "value if set."
-        },
-    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -220,7 +201,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info(f"Training/evaluation parameters {training_args}")
+    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -236,19 +217,17 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
-                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
-                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
@@ -259,7 +238,7 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -320,13 +299,6 @@ def main():
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

-    if data_args.max_seq_length > tokenizer.model_max_length:
-        logger.warning(
-            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
-            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
-        )
-    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
-
    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False
@@ -334,7 +306,7 @@ def main():
        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
-            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
+            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
@@ -356,6 +328,13 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

+        if data_args.max_seq_length > tokenizer.model_max_length:
+            logger.warn(
+                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+            )
+        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
@@ -378,7 +357,6 @@ def main():
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
-
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
@@ -386,20 +364,6 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

-    if training_args.do_train:
-        if "train" not in tokenized_datasets:
-            raise ValueError("--do_train requires a train dataset")
-        train_dataset = tokenized_datasets["train"]
-        if data_args.max_train_samples is not None:
-            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-
-    if training_args.do_eval:
-        if "validation" not in tokenized_datasets:
-            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = tokenized_datasets["validation"]
-        if data_args.max_val_samples is not None:
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
-
    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
@@ -411,8 +375,8 @@ def main():
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset if training_args.do_train else None,
-        eval_dataset=eval_dataset if training_args.do_eval else None,
+        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
+        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
@@ -427,30 +391,37 @@ def main():
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
-        metrics = train_result.metrics

-        max_train_samples = (
-            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
-        )
-        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
+        if trainer.is_world_process_zero():
+            with open(output_train_file, "w") as writer:
+                logger.info("***** Train results *****")
+                for key, value in sorted(train_result.metrics.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")

-        trainer.log_metrics("train", metrics)
-        trainer.save_metrics("train", metrics)
-        trainer.save_state()
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
+    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

-        metrics = trainer.evaluate()
+        eval_output = trainer.evaluate()

-        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
-        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
-        perplexity = math.exp(metrics["eval_loss"])
-        metrics["perplexity"] = perplexity
+        perplexity = math.exp(eval_output["eval_loss"])
+        results["perplexity"] = perplexity

-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
+        if trainer.is_world_process_zero():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in sorted(results.items()):
+                    logger.info(f"  {key} = {value}")
+                    writer.write(f"{key} = {value}\n")
+
+    return results


 def _mp_fn(index):
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@@ -436,7 +436,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
-                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")
+                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
--- a/examples/legacy/run_camembert.py
+++ b/examples/legacy/run_camembert.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import torch

 from transformers import CamembertForMaskedLM, CamembertTokenizer
--- a/examples/legacy/run_chinese_ref.py
+++ b/examples/legacy/run_chinese_ref.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 import argparse
 import json
 from typing import List
--- a/examples/legacy/run_language_modeling.py
+++ b/examples/legacy/run_language_modeling.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/run_openai_gpt.py
+++ b/examples/legacy/run_openai_gpt.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/run_swag.py
+++ b/examples/legacy/run_swag.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/run_transfo_xl.py
+++ b/examples/legacy/run_transfo_xl.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
--- a/examples/legacy/seq2seq/README.md
+++ b/examples/legacy/seq2seq/README.md
@@ -1,334 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-# Sequence-to-Sequence Training and Evaluation
-
-This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
-For deprecated `bertabs` instructions, see [`bertabs/README.md`](https://github.com/huggingface/transformers/blob/master/examples/research_projects/bertabs/README.md).
-
-### Supported Architectures
-
- `BartForConditionalGeneration`
- `MarianMTModel`
- `PegasusForConditionalGeneration`
- `MBartForConditionalGeneration`
- `FSMTForConditionalGeneration`
- `T5ForConditionalGeneration`
-
-### Download the Datasets
-
-#### XSUM
-
-```bash
-cd examples/legacy/seq2seq
-wget https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz
-tar -xzvf xsum.tar.gz
-export XSUM_DIR=${PWD}/xsum
-```
-this should make a directory called `xsum/` with files like `test.source`.
-To use your own data, copy that files format. Each article to be summarized is on its own line.
-
-#### CNN/DailyMail
-
-```bash
-cd examples/legacy/seq2seq
-wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz
-tar -xzvf cnn_dm_v2.tgz  # empty lines removed
-mv cnn_cln cnn_dm
-export CNN_DIR=${PWD}/cnn_dm
-```
-this should make a directory called `cnn_dm/` with 6 files.
-
-#### WMT16 English-Romanian Translation Data
-
-download with this command:
-```bash
-wget https://cdn-datasets.huggingface.co/translation/wmt_en_ro.tar.gz
-tar -xzvf wmt_en_ro.tar.gz
-export ENRO_DIR=${PWD}/wmt_en_ro
-```
-this should make a directory called `wmt_en_ro/` with 6 files.
-
-#### WMT English-German
-
-```bash
-wget https://cdn-datasets.huggingface.co/translation/wmt_en_de.tgz
-tar -xzvf wmt_en_de.tgz
-export DATA_DIR=${PWD}/wmt_en_de
-```
-
-#### FSMT datasets (wmt)
-
-Refer to the scripts starting with `eval_` under:
-https://github.com/huggingface/transformers/tree/master/scripts/fsmt
-
-#### Pegasus (multiple datasets)
-
-Multiple eval datasets are available for download from:
-https://github.com/stas00/porting/tree/master/datasets/pegasus
-
-
-#### Your Data
-
-If you are using your own data, it must be formatted as one directory with 6 files:
-```
-train.source
-train.target
-val.source
-val.target
-test.source
-test.target
-```
-The `.source` files are the input, the `.target` files are the desired output.
-
-### Potential issues
-
- native AMP (`--fp16` and no apex) may lead to a huge memory leak and require 10x gpu memory. This has been fixed in pytorch-nightly and the minimal official version to have this fix will be pytorch-1.7.1. Until then if you have to use mixed precision please use AMP only with pytorch-nightly or NVIDIA's apex. Reference: https://github.com/huggingface/transformers/issues/8403
-
-
-### Tips and Tricks
-
-General Tips:
- since you need to run from `examples/legacy/seq2seq`, and likely need to modify code, the easiest workflow is fork transformers, clone your fork, and run `pip install -e .` before you get started.
- try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size.  (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
- `fp16_opt_level=O1` (the default works best).
- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
-Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr)`.
- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
- This warning can be safely ignored:
-    > "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
- Both finetuning and eval are 30% faster with `--fp16`. For that you need to [install apex](https://github.com/NVIDIA/apex#quick-start).
- Read scripts before you run them!
-
-Summarization Tips:
- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
- For CNN/DailyMail, the default `val_max_target_length` and `test_max_target_length` will truncate the ground truth labels, resulting in slightly higher rouge scores. To get accurate rouge scores, you should rerun calculate_rouge on the `{output_dir}/test_generations.txt` file saved by `trainer.test()`
- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
-(It rarely makes sense to start from `bart-large` unless you are a researching finetuning methods).
-
-**Update 2018-07-18**
-Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
-Future work/help wanted: A new dataset to support multilingual tasks.
-
-
-### Fine-tuning using Seq2SeqTrainer
-To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer`-related `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.
-
-With PyTorch 1.6+ it'll automatically use `native AMP` when `--fp16` is set.
-
-To see all the possible command line options, run:
-
-```bash
-python finetune_trainer.py --help
-```
-
-For multi-gpu training use `torch.distributed.launch`, e.g. with 2 gpus:
-```bash
-python -m torch.distributed.launch --nproc_per_node=2  finetune_trainer.py ...
-```
-
-**At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
-
-All `Seq2SeqTrainer`-based fine-tuning scripts are included in the `builtin_trainer` directory.
-
-#### TPU Training
-`Seq2SeqTrainer` supports TPU training with few caveats
-1. As `generate` method does not work on TPU at the moment, `predict_with_generate` cannot be used. You should use `--prediction_loss_only` to only calculate loss, and do not set `--do_predict` and `--predict_with_generate`.
-2. All sequences should be padded to be of equal length to avoid extremely slow training. (`finetune_trainer.py` does this automatically when running on TPU.)
-
-We provide a very simple launcher script named `xla_spawn.py` that lets you run our example scripts on multiple TPU cores without any boilerplate. Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for `torch.distributed`).
-
-`builtin_trainer/finetune_tpu.sh` script provides minimal arguments needed for TPU training.
-
-The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on TPU V3-8 and should complete one epoch in ~5-6 mins.
-
-```bash
-./builtin_trainer/train_distil_marian_enro_tpu.sh
-```
-
-## Evaluation Commands
-
-To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.
-If 'translation' is in your task name, the computed metric will be BLEU. Otherwise, ROUGE will be used.
-
-For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
-```bash
-export DATA_DIR=wmt_en_ro
-./run_eval.py t5-base \
-    $DATA_DIR/val.source t5_val_generations.txt \
-    --reference_path $DATA_DIR/val.target \
-    --score_path enro_bleu.json \
-    --task translation_en_to_ro \
-    --n_obs 100 \
-    --device cuda \
-    --fp16 \
-    --bs 32
-```
-
-This command works for MBART, although the BLEU score is suspiciously low.
-```bash
-export DATA_DIR=wmt_en_ro
-./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
-    --reference_path $DATA_DIR/val.target \
-    --score_path enro_bleu.json \
-    --task translation \
-    --n_obs 100 \
-    --device cuda \
-    --fp16 \
-    --bs 32
-```
-
-Summarization (xsum will be very similar):
-```bash
-export DATA_DIR=cnn_dm
-./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
-    --reference_path $DATA_DIR/val.target \
-    --score_path cnn_rouge.json \
-    --task summarization \
-    --n_obs 100 \
-
-th 56 \
-    --fp16 \
-    --bs 32
-```
-
-### Multi-GPU Evaluation
-here is a command to run xsum evaluation on 8 GPUS. It is more than linearly faster than run_eval.py in some cases
-because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
-`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8  run_distributed_eval.py \
-    --model_name sshleifer/distilbart-large-xsum-12-3  \
-    --save_dir xsum_generations \
-    --data_dir xsum \
-    --fp16  # you can pass generate kwargs like num_beams here, just like run_eval.py
-```
-
-Contributions that implement this command for other distributed hardware setups are welcome!
-
-#### Single-GPU Eval: Tips and Tricks
-
-When using `run_eval.py`, the following features can be useful:
-
-* if you running the script multiple times and want to make it easier to track what arguments produced that output, use `--dump-args`. Along with the results it will also dump any custom params that were passed to the script. For example if you used: `--num_beams 8 --early_stopping true`, the output will be:
-   ```
-   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True}
-   ```
-
-   `--info` is an additional argument available for the same purpose of tracking the conditions of the experiment. It's useful to pass things that weren't in the argument list, e.g. a language pair `--info "lang:en-ru"`. But also if you pass `--info` without a value it will fallback to the current date/time string, e.g. `2020-09-13 18:44:43`.
-
-   If using `--dump-args --info`, the output will be:
-
-   ```
-   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': '2020-09-13 18:44:43'}
-   ```
-
-   If using `--dump-args --info "pair:en-ru chkpt=best`, the output will be:
-
-   ```
-   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': 'pair=en-ru chkpt=best'}
-   ```
-
-
-* if you need to perform a parametric search in order to find the best ones that lead to the highest BLEU score, let `run_eval_search.py` to do the searching for you.
-
-   The script accepts the exact same arguments as `run_eval.py`, plus an additional argument `--search`. The value of `--search` is parsed, reformatted and fed to ``run_eval.py`` as additional args.
-
-   The format for the `--search` value is a simple string with hparams and colon separated values to try, e.g.:
-   ```
-    --search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
-   ```
-   which will generate `12` `(2*3*2)` searches for a product of each hparam. For example the example that was just used will invoke `run_eval.py` repeatedly with:
-
-   ```
-    --num_beams 5 --length_penalty 0.8 --early_stopping true
-    --num_beams 5 --length_penalty 0.8 --early_stopping false
-    [...]
-    --num_beams 10 --length_penalty 1.2 --early_stopping false
-   ```
-
-   On completion, this function prints a markdown table of the results sorted by the best BLEU score and the winning arguments.
-
-```
-bleu  | num_beams | length_penalty | early_stopping
----- | --------- | -------------- | --------------
-26.71 |         5 |            1.1 |              1
-26.66 |         5 |            0.9 |              1
-26.66 |         5 |            0.9 |              0
-26.41 |         5 |            1.1 |              0
-21.94 |         1 |            0.9 |              1
-21.94 |         1 |            0.9 |              0
-21.94 |         1 |            1.1 |              1
-21.94 |         1 |            1.1 |              0
-
-Best score args:
-stas/wmt19-en-ru data/en-ru/val.source data/en-ru/test_translations.txt --reference_path data/en-ru/val.target --score_path data/en-ru/test_bleu.json --bs 8 --task translation --num_beams 5 --length_penalty 1.1 --early_stopping True
-```
-
-If you pass `--info "some experiment-specific info"` it will get printed before the results table - this is useful for scripting and multiple runs, so one can tell the different sets of results from each other.
-
-
-### Contributing
- follow the standard contributing guidelines and code of conduct.
- add tests to `test_seq2seq_examples.py`
- To run only the seq2seq tests, you must be in the root of the repository and run:
-```bash
-pytest examples/seq2seq/
-```
-
-### Converting pytorch-lightning checkpoints
-pytorch lightning ``-do_predict`` often fails, after you are done training, the best way to evaluate your model is to convert it.
-
-This should be done for you, with a file called `{save_dir}/best_tfmr`.
-
-If that file doesn't exist but you have a lightning `.ckpt` file, you can run
-```bash
-python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT  randomly_initialized_hf_model_path save_dir/best_tfmr
-```
-Then either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections)
-
-
-# Experimental Features
-These features are harder to use and not always useful.
-
-###  Dynamic Batch Size for MT
-`finetune.py` has a command line arg `--max_tokens_per_batch` that allows batches to be dynamically sized.
-This feature can only be used:
- with fairseq installed
- on 1 GPU
- without sortish sampler
- after calling `./save_len_file.py $tok $data_dir`
-
-For example,
-```bash
-./save_len_file.py Helsinki-NLP/opus-mt-en-ro  wmt_en_ro
-./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
-```
-splits `wmt_en_ro/train` into 11,197 uneven lengthed batches and can finish 1 epoch in 8 minutes on a v100.
-
-For comparison,
-```bash
-./dynamic_bs_example.sh --sortish_sampler --train_batch_size 48
-```
-uses 12,723 batches of length 48 and takes slightly more time 9.5 minutes.
-
-The feature is still experimental, because:
-+ we can make it much more robust if we have memory mapped/preprocessed datasets.
-+ The speedup over sortish sampler is not that large at the moment.
--- a/examples/legacy/seq2seq/requirements.txt
+++ b/examples/legacy/seq2seq/requirements.txt
@@ -1,20 +0,0 @@
-tensorboard
-scikit-learn
-seqeval
-psutil
-sacrebleu
-rouge-score
-tensorflow_datasets
-matplotlib
-git-python==1.0.3
-faiss-cpu
-streamlit
-elasticsearch
-nltk
-pandas
-datasets >= 1.1.3
-fire
-pytest
-conllu
-sentencepiece != 0.1.92
-protobuf
--- a/examples/legacy/token-classification/README.md
+++ b/examples/legacy/token-classification/README.md
@@ -129,71 +129,6 @@ On the test dataset the following results could be achieved:
 10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
 ```

-#### Run the Tensorflow 2 version
-
-To start training, just run:
-
-```bash
-python3 run_tf_ner.py --data_dir ./ \
---labels ./labels.txt \
---model_name_or_path $BERT_MODEL \
---output_dir $OUTPUT_DIR \
---max_seq_length  $MAX_LENGTH \
---num_train_epochs $NUM_EPOCHS \
---per_device_train_batch_size $BATCH_SIZE \
---save_steps $SAVE_STEPS \
---seed $SEED \
---do_train \
---do_eval \
---do_predict
-```
-
-Such as the Pytorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be both evaluated on development and test datasets.
-
-#### Evaluation
-
-Evaluation on development dataset outputs the following for our example:
-```bash
-           precision    recall  f1-score   support
-
- LOCderiv     0.7619    0.6154    0.6809        52
-  PERpart     0.8724    0.8997    0.8858      4057
-  OTHpart     0.9360    0.9466    0.9413       711
-  ORGpart     0.7015    0.6989    0.7002       269
-  LOCpart     0.7668    0.8488    0.8057       496
-      LOC     0.8745    0.9191    0.8963       235
- ORGderiv     0.7723    0.8571    0.8125        91
- OTHderiv     0.4800    0.6667    0.5581        18
-      OTH     0.5789    0.6875    0.6286        16
- PERderiv     0.5385    0.3889    0.4516        18
-      PER     0.5000    0.5000    0.5000         2
-      ORG     0.0000    0.0000    0.0000         3
-
-micro avg     0.8574    0.8862    0.8715      5968
-macro avg     0.8575    0.8862    0.8713      5968
-```
-
-On the test dataset the following results could be achieved:
-```bash
-           precision    recall  f1-score   support
-
-  PERpart     0.8847    0.8944    0.8896      9397
-  OTHpart     0.9376    0.9353    0.9365      1639
-  ORGpart     0.7307    0.7044    0.7173       697
-      LOC     0.9133    0.9394    0.9262       561
-  LOCpart     0.8058    0.8157    0.8107      1150
-      ORG     0.0000    0.0000    0.0000         8
- OTHderiv     0.5882    0.4762    0.5263        42
- PERderiv     0.6571    0.5227    0.5823        44
-      OTH     0.4906    0.6667    0.5652        39
- ORGderiv     0.7016    0.7791    0.7383       172
- LOCderiv     0.8256    0.6514    0.7282       109
-      PER     0.0000    0.0000    0.0000        11
-
-micro avg     0.8722    0.8774    0.8748     13869
-macro avg     0.8712    0.8774    0.8740     13869
-```
-
 ### Emerging and Rare Entities task: WNUT’17 (English NER) dataset

 Description of the WNUT’17 task from the [shared task website](http://noisy-text.github.io/2017/index.html):
--- a/examples/multiple-choice/README.md
+++ b/examples/multiple-choice/README.md
@@ -14,13 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->

-# Multiple Choice
+## Multiple Choice

 Based on the script [`run_swag.py`]().

-## PyTorch script: fine-tuning on SWAG
-
-`run_swag` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture as a `ForMultipleChoice` version in the library) on the SWAG dataset or your own csv/jsonlines files as long as they are structured the same way. To make it works on another dataset, you will need to tweak the `preprocess_function` inside the script.
+#### Fine-tuning on SWAG

 ```bash
 python examples/multiple-choice/run_swag.py \
@@ -41,73 +39,6 @@ eval_acc = 0.8338998300509847
 eval_loss = 0.44457291918821606
 ```

-## PyTorch version, no Trainer
-
-Based on the script [run_swag_no_trainer.py](https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/run_swag_no_trainer.py).
-
-Like `run_swag.py`, this script allows you to fine-tune any of the models on the [hub](https://huggingface.co/models) (as long as its architecture has a `ForMultipleChoice` version in the library) on
-the SWAG dataset or your own data in a csv or a JSON file. The main difference is that this
-script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
-
-It offers fewer options than the script with `Trainer` (but you can easily change the options for the optimizer
-or the dataloaders directly in the script) but still runs in a distributed setup, on TPU and supports mixed precision by
-means of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
-after installing it:
-
-```bash
-pip install accelerate
-```
-
-then
-
-```bash
-export DATASET_NAME=swag
-
-python run_swag_no_trainer.py \
-  --model_name_or_path bert-base-cased \
-  --dataset_name $DATASET_NAME \
-  --max_seq_length 128 \
-  --per_device_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3 \
-  --output_dir /tmp/$DATASET_NAME/
-```
-
-You can then use your usual launchers to run it in a distributed environment, but the easiest way is to run
-
-```bash
-accelerate config
-```
-
-and reply to the questions asked. Then
-
-```bash
-accelerate test
-```
-
-that will check everything is ready for training. Finally, you can launch training with
-
-```bash
-export DATASET_NAME=swag
-
-accelerate launch run_swag_no_trainer.py \
-  --model_name_or_path bert-base-cased \
-  --dataset_name $DATASET_NAME \
-  --max_seq_length 128 \
-  --per_device_train_batch_size 32 \
-  --learning_rate 2e-5 \
-  --num_train_epochs 3 \
-  --output_dir /tmp/$DATASET_NAME/
-```
-
-This command is the same and will work for:
-
- a CPU-only setup
- a setup with one GPU
- a distributed training with several GPUs (single or multi node)
- a training on TPUs
-
-Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.

 ## Tensorflow

@@ -129,3 +60,6 @@ python ./examples/multiple-choice/run_tf_multiple_choice.py \
 --gradient_accumulation_steps 2 \
 --overwrite_output
 ```
+
+# Run it in colab
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
--- a/examples/multiple-choice/run_no_trainer.sh
+++ b/examples/multiple-choice/run_no_trainer.sh
@@ -1,19 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-accelerate launch run_swag_no_trainer.py \
-  --model_name_or_path bert-base-uncased \
-  --dataset_name swag \
-  --output_dir /tmp/test-swag-no-trainer \
-  --pad_to_max_length
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Lysandre	cc86472c78	Release: v4.3.1 Some checks failed Model templates runner / run_tests_templates (push) Has been cancelled Details Release - Conda / build_and_package (push) Has been cancelled Details	2021-02-09 09:55:55 +01:00
Patrick von Platen	02451cda74	Deprecate Wav2Vec2ForMaskedLM and add Wav2Vec2ForCTC (#10089 ) * add wav2vec2CTC and deprecate for maskedlm * remove from docs	2021-02-09 09:55:55 +01:00
Lysandre	800f385d78	Release: v4.3.0 Some checks failed Model templates runner / run_tests_templates (push) Has been cancelled Details Release - Conda / build_and_package (push) Has been cancelled Details	2021-02-08 18:31:49 +01:00
Anthony MOI	bcf49c0438	Update tokenizers requirement (#10077 )	2021-02-08 18:29:16 +01:00
Patrick von Platen	15a8906c71	Bump minimum Jax requirement to 2.8.0 (#10027 ) * Bump minimum Jax requirement to 2.8.0 * update table	2021-02-08 18:18:26 +01:00