Release v4.5.0

[WIP] GPT Neo cleanup (#10985)
* better names
* add attention mixin
* all slow tests in one class
* make helper methods static so we can test
* add local attention tests
* better names
* doc
* apply review suggestions
2021-04-06 12:37:47 -04:00 · 2021-04-06 12:24:15 -04:00 · 2021-04-06 15:12:21 +02:00 · 2021-04-06 14:56:18 +02:00 · 2021-04-06 08:55:40 -04:00 · 2021-04-06 08:03:00 -04:00
383 changed files with 24334 additions and 5983 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,7 +3,6 @@ orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0

-
 # TPU REFERENCES
 references:
    checkout_ml_testing: &checkout_ml_testing
@@ -69,6 +68,8 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
+            RUN_PT_TF_CROSS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -79,13 +80,13 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -97,6 +98,8 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
+            RUN_PT_FLAX_CROSS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -107,13 +110,13 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: RUN_PT_FLAX_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -125,6 +128,7 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -135,8 +139,8 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -153,6 +157,7 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -179,6 +184,7 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -205,6 +211,8 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -215,13 +223,13 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech]
-            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -233,6 +241,8 @@ jobs:
            - image: circleci/python:3.7
        environment:
            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -247,7 +257,7 @@ jobs:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -259,6 +269,7 @@ jobs:
            - image: circleci/python:3.7
        environment:
            RUN_CUSTOM_TOKENIZERS: yes
+            TRANSFORMERS_IS_CI: yes
        steps:
            - checkout
            - restore_cache:
@@ -284,6 +295,7 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -299,7 +311,7 @@ jobs:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
+            - run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
@@ -309,6 +321,9 @@ jobs:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.7
+        environment:
+            RUN_GIT_LFS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
@@ -319,7 +334,7 @@ jobs:
                git config --global user.name "ci"
            - run: pip install --upgrade pip
            - run: pip install .[testing]
-            - run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
+            - run: python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"

    build_doc:
        working_directory: ~/transformers
@@ -383,12 +398,14 @@ jobs:
                      - '~/.cache/pip'
            - run: black --check examples tests src utils
            - run: isort --check-only examples tests src utils
+            - run: python utils/custom_init_isort.py --check_only
            - run: flake8 examples tests src utils
            - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
            - run: python utils/check_copies.py
            - run: python utils/check_table.py
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
+            - run: python utils/check_inits.py

    check_repository_consistency:
        working_directory: ~/transformers
@@ -407,6 +424,7 @@ jobs:
            - image: circleci/python:3.6
        environment:
            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
        resource_class: xlarge
        parallelism: 1
        steps:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -57,4 +57,7 @@ deploy_doc "818878d" v3.5.1
 deploy_doc "c781171" v4.0.1
 deploy_doc "bfa4ccf" v4.1.1
 deploy_doc "7d9a9d0" v4.2.2
-deploy_doc "bae0c79"  # v4.3.3 Latest stable release
+deploy_doc "bae0c79" v4.3.3
+deploy_doc "c988db5" v4.4.0
+deploy_doc "c5d6a28" v4.4.1
+deploy_doc "6bc89ed"  # v4.4.2 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -34,7 +34,7 @@ Models:
 - funnel: @sgugger
 - gpt2: @patrickvonplaten, @LysandreJik
 - rag: @patrickvonplaten, @lhoestq
- tensorflow: @jplu
+- tensorflow: @LysandreJik

 Library:

@@ -48,9 +48,13 @@ Library:

 Documentation: @sgugger

+Model hub:
+
+- For issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.
+
 HF projects:

- nlp datasets: [different repo](https://github.com/huggingface/nlp)
+- datasets: [different repo](https://github.com/huggingface/datasets)
 - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

 Examples:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -46,7 +46,7 @@ Models:
 - funnel: @sgugger
 - gpt2: @patrickvonplaten, @LysandreJik
 - rag: @patrickvonplaten, @lhoestq
- tensorflow: @jplu
+- tensorflow: @LysandreJik

 Library:

@@ -62,7 +62,7 @@ Documentation: @sgugger

 HF projects:

- nlp datasets: [different repo](https://github.com/huggingface/nlp)
+- datasets: [different repo](https://github.com/huggingface/datasets)
 - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)

 Examples:
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -12,6 +12,12 @@ on:
      - "templates/**"
  repository_dispatch:

+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+
 jobs:
  run_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
@@ -40,10 +46,6 @@ jobs:
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all non-slow tests on GPU
-        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests

@@ -83,11 +85,8 @@ jobs:

      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
          TF_NUM_INTRAOP_THREADS: 8
          TF_NUM_INTEROP_THREADS: 1
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_gpu tests

@@ -131,10 +130,7 @@ jobs:

      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
          MKL_SERVICE_FORCE_INTEL: 1
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests

@@ -174,11 +170,8 @@ jobs:

      - name: Run all non-slow tests on GPU
        env:
-          OMP_NUM_THREADS: 8
-          MKL_NUM_THREADS: 8
          TF_NUM_INTRAOP_THREADS: 8
          TF_NUM_INTEROP_THREADS: 1
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 2 --dist=loadfile --make-reports=tests_tf_multi_gpu tests

--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -8,6 +8,13 @@ on:
  schedule:
    - cron: "0 0 * * *"

+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  RUN_SLOW: yes
+  OMP_NUM_THREADS: 16
+  MKL_NUM_THREADS: 16
+
 jobs:
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
@@ -36,11 +43,6 @@ jobs:
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      - name: Run all tests on GPU
-        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
-          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests

@@ -55,6 +57,7 @@ jobs:
          MKL_NUM_THREADS: 16
          RUN_SLOW: yes
          HF_HOME: /mnt/cache
+          TRANSFORMERS_IS_CI: yes
        run: |
          pip install -r examples/_tests_requirements.txt
          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
@@ -66,11 +69,7 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
-          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

@@ -110,12 +109,8 @@ jobs:

      - name: Run all tests on GPU
        env:
-          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          OMP_NUM_THREADS: 16
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
-          MKL_NUM_THREADS: 16
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests

@@ -126,13 +121,9 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          OMP_NUM_THREADS: 16
          RUN_PIPELINE_TESTS: yes
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
-          MKL_NUM_THREADS: 16
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

@@ -175,10 +166,6 @@ jobs:

      - name: Run all tests on GPU
        env:
-          RUN_SLOW: yes
-          HF_HOME: /mnt/cache
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
@@ -190,11 +177,7 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          MKL_NUM_THREADS: 16
-          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

@@ -234,12 +217,8 @@ jobs:

      - name: Run all tests on GPU
        env:
-          OMP_NUM_THREADS: 16
-          RUN_SLOW: yes
-          MKL_NUM_THREADS: 16
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests

@@ -250,13 +229,9 @@ jobs:
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
-          OMP_NUM_THREADS: 16
-          RUN_SLOW: yes
          RUN_PIPELINE_TESTS: yes
-          MKL_NUM_THREADS: 16
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
-          HF_HOME: /mnt/cache
        run: |
          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

--- a/Makefile
+++ b/Makefile
@@ -19,34 +19,44 @@ modified_only_fixup:
 deps_table_update:
 	@python setup.py deps_table_update

+# autogenerating code
+
+autogenerate_code: deps_table_update
+	python utils/class_mapping_update.py
+
 # Check that source code meets quality standards

-extra_quality_checks: deps_table_update
+extra_quality_checks:
 	python utils/check_copies.py
 	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
-	python utils/style_doc.py src/transformers docs/source --max_len 119
-	python utils/class_mapping_update.py
+	python utils/check_inits.py

 # this target runs checks on all files
 quality:
 	black --check $(check_dirs)
 	isort --check-only $(check_dirs)
+	python utils/custom_init_isort.py --check_only
 	flake8 $(check_dirs)
-	python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
 	${MAKE} extra_quality_checks

 # Format source code automatically and check if there are any problems left that need manual fixing

-style: deps_table_update
+extra_style_checks:
+	python utils/custom_init_isort.py
+	python utils/style_doc.py src/transformers docs/source --max_len 119
+
+# this target runs checks on all files and potentially modifies some of them
+style:
 	black $(check_dirs)
 	isort $(check_dirs)
-	python utils/style_doc.py src/transformers docs/source --max_len 119
+	${MAKE} autogenerate_code
+	${MAKE} extra_style_checks

 # Super fast fix and check target that only works on relevant modified files since the branch was made

-fixup: modified_only_fixup extra_quality_checks
+fixup: modified_only_fixup extra_style_checks autogenerate_code extra_quality_checks

 # Make marked copies of snippets of codes conform to the original

@@ -65,6 +75,12 @@ test:
 test-examples:
 	python -m pytest -n auto --dist=loadfile -s -v ./examples/

+# Run tests for SageMaker DLC release
+
+test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker]
+	TEST_SAGEMAKER=True python -m pytest -n auto  -s -v ./tests/sagemaker
+
+
 # Check that docs can build

 docs:
@@ -83,4 +99,3 @@ post-release:

 post-patch:
 	python utils/release.py --post_release --patch
-
--- a/README.md
+++ b/README.md
@@ -194,6 +194,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
@@ -212,6 +213,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
 1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
@@ -232,6 +234,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,11 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.3.2"
+const stableVersion = "v4.4.2"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.3.0/v4.3.1/v4.3.2/v4.3.3 (stable)",
+    "": "v4.4.0/v4.4.1/v4.4.2 (stable)",
+    "v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3",
    "v4.2.2": "v4.2.0/v4.2.1/v4.2.2",
    "v4.1.1": "v4.1.0/v4.1.1",
    "v4.0.1": "v4.0.0/v4.0.1",
@@ -61,7 +62,7 @@ function addIcon() {
 function addCustomFooter() {
    const customFooter = document.createElement("div");
    const questionOrIssue = document.createElement("div");
-    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
+    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://huggingface.co/blog'>Blog posts</a> or <a href='https://github.com/huggingface/transformers'>Create an issue</a>";
    customFooter.appendChild(questionOrIssue);
    customFooter.classList.add("footer");

--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -48,3 +48,6 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)|
 |[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)|
 |[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)|
+|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
+|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
+|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -14,24 +14,24 @@
 #
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../../src'))
+
+sys.path.insert(0, os.path.abspath("../../src"))


 # -- Project information -----------------------------------------------------

-project = u'transformers'
-copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0'
-author = u'huggingface'
+project = "transformers"
+copyright = "2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0"
+author = "huggingface"

 # The short X.Y version
-version = u''
+version = ""
 # The full version, including alpha/beta/rc tags
-release = u'4.4.2'
-
+release = "4.5.0.dev0"


 # Prefix link to point to master, comment this during version release and uncomment below line
-extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
+extlinks = {"prefix_link": ("https://github.com/huggingface/transformers/blob/master/%s", "")}
 # Prefix link to always point to corresponding version, uncomment this during version release
 # extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')}

@@ -45,27 +45,28 @@ extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/ma
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.extlinks',
-    'sphinx.ext.coverage',
-    'sphinx.ext.napoleon',
-    'recommonmark',
-    'sphinx.ext.viewcode',
-    'sphinx_markdown_tables',
-    'sphinx_copybutton'
+    "sphinx.ext.autodoc",
+    "sphinx.ext.extlinks",
+    "sphinx.ext.coverage",
+    "sphinx.ext.napoleon",
+    "recommonmark",
+    "sphinx.ext.viewcode",
+    "sphinx_markdown_tables",
+    "sphinxext.opengraph",
+    "sphinx_copybutton",
 ]

 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]

 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
-source_suffix = ['.rst', '.md']
+source_suffix = [".rst", ".md"]
 # source_suffix = '.rst'

 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -77,7 +78,7 @@ language = None
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]

 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = None
@@ -91,21 +92,30 @@ copybutton_prompt_is_regexp = True
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"

 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
-html_theme_options = {
-    'analytics_id': 'UA-83738774-2',
-    'navigation_with_keys': True
-}
+html_theme_options = {"analytics_id": "UA-83738774-2", "navigation_with_keys": True}
+
+# Configuration for OpenGraph and Twitter Card tags.
+# These are responsible for creating nice shareable social images; see
+# https://ahrefs.com/blog/open-graph-meta-tags/ and https://ogp.me/#type_website
+ogp_image = "https://huggingface.co/front/thumbnails/transformers.png"
+ogp_description = "State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0. Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone"
+ogp_description_length = 160
+
+ogp_custom_meta_tags = [
+    f'<meta name="twitter:image" content="{ogp_image}">',
+    f'<meta name="twitter:description" content="{ogp_description}">',
+]

 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]

 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -117,17 +127,17 @@ html_static_path = ['_static']
 #
 # html_sidebars = {}

-# This must be the name of an image file (path relative to the configuration 
-# directory) that is the favicon of the docs. Modern browsers use this as 
-# the icon for tabs, windows and bookmarks. It should be a Windows-style 
+# This must be the name of an image file (path relative to the configuration
+# directory) that is the favicon of the docs. Modern browsers use this as
+# the icon for tabs, windows and bookmarks. It should be a Windows-style
 # icon file (.ico).
-html_favicon = 'favicon.ico'
+html_favicon = "favicon.ico"


 # -- Options for HTMLHelp output ---------------------------------------------

 # Output file base name for HTML help builder.
-htmlhelp_basename = 'transformersdoc'
+htmlhelp_basename = "transformersdoc"


 # -- Options for LaTeX output ------------------------------------------------
@@ -136,15 +146,12 @@ latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
-
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
-
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
-
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
@@ -154,8 +161,7 @@ latex_elements = {
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'transformers.tex', u'transformers Documentation',
-     u'huggingface', 'manual'),
+    (master_doc, "transformers.tex", "transformers Documentation", "huggingface", "manual"),
 ]


@@ -163,10 +169,7 @@ latex_documents = [

 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'transformers', u'transformers Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "transformers", "transformers Documentation", [author], 1)]


 # -- Options for Texinfo output ----------------------------------------------
@@ -175,9 +178,15 @@ man_pages = [
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'transformers', u'transformers Documentation',
-     author, 'transformers', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "transformers",
+        "transformers Documentation",
+        author,
+        "transformers",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]


@@ -196,11 +205,13 @@ epub_title = project
 # epub_uid = ''

 # A list of files that should not be packed into the epub file.
-epub_exclude_files = ['search.html']
+epub_exclude_files = ["search.html"]
+

 def setup(app):
-    app.add_css_file('css/huggingface.css')
-    app.add_css_file('css/code-snippets.css')
-    app.add_js_file('js/custom.js')
+    app.add_css_file("css/huggingface.css")
+    app.add_css_file("css/code-snippets.css")
+    app.add_js_file("js/custom.js")
+

 # -- Extension configuration -------------------------------------------------
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -15,10 +15,10 @@ Fine-tuning with custom datasets

 .. note::

-    The datasets used in this tutorial are available and can be more easily accessed using the `🤗 NLP library
-    <https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here since this tutorial
-    meant to illustrate how to work with your own data. A brief of introduction can be found at the end of the tutorial
-    in the section ":ref:`nlplib`".
+    The datasets used in this tutorial are available and can be more easily accessed using the `🤗 Datasets library
+    <https://github.com/huggingface/datasets>`_. We do not use this library to access the datasets here since this
+    tutorial is meant to illustrate how to work with your own data. A brief introduction can be found at the end of the
+    tutorial in the section ":ref:`datasetslib`".

 This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
 shows one of many valid workflows for using these models and is meant to be illustrative rather than definitive. We
@@ -41,7 +41,7 @@ Sequence Classification with IMDb Reviews
 .. note::

    This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and
-    can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
+    can be alternatively downloaded with the 🤗 Datasets library with ``load_dataset("imdb")``.

 In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes
 the text of a review and requires the model to predict whether the sentiment of the review is positive or negative.
@@ -260,7 +260,7 @@ Token Classification with W-NUT Emerging Entities
 .. note::

    This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_),
-    and can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
+    and can be alternatively downloaded with the 🤗 Datasets library with ``load_dataset("wnut_17")``.

 Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
 token. We'll demonstrate how to do this with `Named Entity Recognition
@@ -459,7 +459,7 @@ Question Answering with SQuAD 2.0
 .. note::

    This dataset can be explored in the Hugging Face model hub (`SQuAD V2
-    <https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 NLP library with
+    <https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 Datasets library with
    ``load_dataset("squad_v2")``.

 Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
@@ -677,22 +677,23 @@ Additional Resources
  - :doc:`Preprocessing <preprocessing>`. Docs page on data preprocessing.
  - :doc:`Training <training>`. Docs page on training and fine-tuning.

-.. _nlplib:
+.. _datasetslib:

-Using the 🤗 NLP Datasets & Metrics library
+Using the 🤗 Datasets & Metrics library
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with 🤗
 Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the `🤗
-NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the `hub
+Datasets library <https://github.com/huggingface/datasets>`_ for working with the 150+ datasets included in the `hub
 <https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview, we
-will show how to use the NLP library to download and prepare the IMDb dataset from the first example, :ref:`seq_imdb`.
+will show how to use the Datasets library to download and prepare the IMDb dataset from the first example,
+:ref:`seq_imdb`.

 Start by downloading the dataset:

 .. code-block:: python

-    from nlp import load_dataset
+    from datasets import load_dataset
    train = load_dataset("imdb", split="train")

 Each dataset has multiple columns corresponding to different features. Let's see what our columns are.
@@ -724,5 +725,5 @@ dataset elements.
    >>> {key: val.shape for key, val in train[0].items()}
    {'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}

-We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for a
-more thorough introduction.
+We now have a fully-prepared dataset. Check out `the 🤗 Datasets docs
+<https://huggingface.co/docs/datasets/processing.html>`_ for a more thorough introduction.
--- a/docs/source/fast_tokenizers.rst
+++ b/docs/source/fast_tokenizers.rst
@@ -0,0 +1,62 @@
+Using tokenizers from 🤗 Tokenizers
+=======================================================================================================================
+
+The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
+<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
+loaded very simply into 🤗 Transformers.
+
+Before getting into the specifics, let's first start by creating a dummy tokenizer in a few lines:
+
+.. code-block::
+
+    >>> from tokenizers import Tokenizer
+    >>> from tokenizers.models import BPE
+    >>> from tokenizers.trainers import BpeTrainer
+    >>> from tokenizers.pre_tokenizers import Whitespace
+
+    >>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
+    >>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+
+    >>> tokenizer.pre_tokenizer = Whitespace()
+    >>> files = [...]
+    >>> tokenizer.train(files, trainer)
+
+We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
+a JSON file for future re-use.
+
+Loading directly from the tokenizer object
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
+:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation, by accepting the instantiated
+``tokenizer`` object as an argument:
+
+.. code-block::
+
+    >>> from transformers import PreTrainedTokenizerFast
+
+    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
+
+This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
+page <main_classes/tokenizer>` for more information.
+
+Loading from a JSON file
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
+
+.. code-block::
+
+    >>> tokenizer.save("tokenizer.json")
+
+The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization
+method using the :obj:`tokenizer_file` parameter:
+
+.. code-block::
+
+    >>> from transformers import PreTrainedTokenizerFast
+
+    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
+
+This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
+page <main_classes/tokenizer>` for more information.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -97,130 +97,139 @@ and conversion utilities for the following models:
 5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
+   for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
+   Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+7. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-7. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
+8. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-8. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+9. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
   <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
-9. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
-   French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
-   Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-10. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+10. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
+    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+11. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-11. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+12. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-12. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+13. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-13. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+14. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-14. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+15. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-15. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+16. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-16. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+17. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-17. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+18. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-18. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+19. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-19. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+20. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-20. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+21. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-21. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+22. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-22. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+23. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+24. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-23. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+25. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-24. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+26. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-25. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+27. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-26. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+28. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-27. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+29. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-28. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+30. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-29. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+31. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-30. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+32. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-31. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+33. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-32. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+34. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-33. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+35. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-34. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+36. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-35. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+37. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-36. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+38. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-37. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+39. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-38. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+40. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-39. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+41. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-40. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+42. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-41. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+43. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-42. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+44. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
+    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
+    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+45. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-43. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+46. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-44. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+47. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-45. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+48. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-46. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+49. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-47. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+50. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -247,6 +256,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           BigBird           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -275,13 +286,15 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -319,6 +332,8 @@ TensorFlow and/or Flax.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -363,11 +378,13 @@ TensorFlow and/or Flax.
    examples
    custom_datasets
    notebooks
+    sagemaker
    community
    converting_tensorflow_models
    migration
    contributing
    add_new_model
+    fast_tokenizers
    testing
    serialization

@@ -406,6 +423,7 @@ TensorFlow and/or Flax.
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
+    model_doc/bigbird
    model_doc/blenderbot
    model_doc/blenderbot_small
    model_doc/bort
@@ -436,6 +454,7 @@ TensorFlow and/or Flax.
    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
+    model_doc/gpt_neo
    model_doc/pegasus
    model_doc/phobert
    model_doc/prophetnet
@@ -448,6 +467,7 @@ TensorFlow and/or Flax.
    model_doc/t5
    model_doc/tapas
    model_doc/transformerxl
+    model_doc/vit
    model_doc/wav2vec2
    model_doc/xlm
    model_doc/xlmprophetnet
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -151,9 +151,9 @@ environment variable for ``TRANSFORMERS_CACHE``.

 ### Note on model downloads (Continuous Integration or large-scale deployments)

-If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through
+If you expect to be downloading large volumes of models (more than 10,000) from huggingface.co (for instance through
 your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
-faster, and cheaper. Feel free to contact us privately if you need any help.
+faster, and cheaper. Feel free to contact us privately; we'd love to help with this.

 ### Offline mode

--- a/docs/source/internal/generation_utils.rst
+++ b/docs/source/internal/generation_utils.rst
@@ -151,6 +151,16 @@ generation.
 .. autoclass:: transformers.HammingDiversityLogitsProcessor
    :members: __call__

+.. autoclass:: transformers.ForcedBOSTokenLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.ForcedEOSTokenLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.InfNanRemoveLogitsProcessor
+    :members: __call__
+
+
 StoppingCriteria
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@@ -47,6 +47,4 @@ Data format
 Utilities
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: transformers.pipelines.get_framework
-
 .. autoclass:: transformers.pipelines.PipelineException
--- a/docs/source/main_classes/callback.rst
+++ b/docs/source/main_classes/callback.rst
@@ -74,6 +74,32 @@ TrainerCallback
 .. autoclass:: transformers.TrainerCallback
    :members:

+Here is an example of how to register a custom callback with the PyTorch :class:`~transformers.Trainer`:
+
+.. code-block:: python
+
+    class MyCallback(TrainerCallback):
+        "A callback that prints a message at the beginning of training"
+
+        def on_train_begin(self, args, state, control, **kwargs):
+            print("Starting training")
+
+    trainer = Trainer(
+        model,
+        args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        callbacks=[MyCallback]  # We can either pass the callback class this way or an instance of it (MyCallback())
+    )
+
+Another way to register a callback is to call ``trainer.add_callback()`` as follows:
+
+.. code-block:: python
+
+    trainer = Trainer(...)
+    trainer.add_callback(MyCallback)
+    # Alternatively, we can pass an instance of the callback class
+    trainer.add_callback(MyCallback())

 TrainerState
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/main_classes/feature_extractor.rst
+++ b/docs/source/main_classes/feature_extractor.rst
@@ -39,3 +39,10 @@ BatchFeature

 .. autoclass:: transformers.BatchFeature
    :members:
+
+
+ImageFeatureExtractionMixin
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.image_utils.ImageFeatureExtractionMixin
+    :members:
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -62,6 +62,11 @@ PreTrainedTokenizer
 PreTrainedTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
+<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 tokenizers library can be
+loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokenizers from 🤗 tokenizers
+<../fast_tokenizers>` page to understand how this is done.
+
 .. autoclass:: transformers.PreTrainedTokenizerFast
    :special-members: __call__
    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -31,7 +31,10 @@ the above features. To inject custom behavior you can subclass them and override
 - **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
 - **log** -- Logs information on the various objects watching training.
 - **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
-  init.
+  init. Note that you can also subclass or override the ``create_optimizer`` and ``create_scheduler`` methods
+  separately.
+- **create_optimizer** -- Sets up the optimizer if it wasn't passed at init.
+- **create_scheduler** -- Sets up the learning rate scheduler if it wasn't passed at init.
 - **compute_loss** - Computes the loss on a batch of training inputs.
 - **training_step** -- Performs a training step.
 - **prediction_step** -- Performs an evaluation/test step.
@@ -542,8 +545,6 @@ cell with:
            "cpu_offload": true
        },

-        "zero_allow_untested_optimizer": true,
-
        "optimizer": {
            "type": "AdamW",
            "params": {
@@ -612,17 +613,11 @@ example ``.json`` files with:

 Some more examples are to be found in the `main repo <https://github.com/microsoft/DeepSpeed>`__ as well.

-While you always have to supply the DeepSpeed configuration file, you can configure the DeepSpeed integration in
-several ways:
-
-1. Supply most of the configuration inside the file, and just use a few required command line arguments. This is the
-   recommended way as it puts most of the configuration params in one place.
-2. Supply just the ZeRO configuration params inside the file, and configure the rest using the normal
-   :class:`~transformers.Trainer` command line arguments.
-3. Any variation of the first two ways.
+When using DeepSpeed you always need to supply a DeepSpeed configuration file, yet some configuration parameters have
+to be configured via the command line. You will find the nuances in the rest of this guide.

 To get an idea of what DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features,
-enables FP16, uses AdamW optimizer and WarmupLR scheduler:
+enables FP16, uses ``AdamW`` optimizer and ``WarmupLR`` scheduler:

 .. code-block:: json

@@ -666,36 +661,33 @@ enables FP16, uses AdamW optimizer and WarmupLR scheduler:
       }
    }

-If you already have a command line that you have been using with :class:`transformers.Trainer` args, you can continue
-using those and the :class:`~transformers.Trainer` will automatically convert them into the corresponding DeepSpeed
-configuration at run time. For example, you could use the following configuration file:
-
-.. code-block:: json
-
-    {
-       "zero_optimization": {
-           "stage": 2,
-           "allgather_partitions": true,
-           "allgather_bucket_size": 5e8,
-           "overlap_comm": true,
-           "reduce_scatter": true,
-           "reduce_bucket_size": 5e8,
-           "contiguous_gradients": true,
-           "cpu_offload": true
-       }
-    }
-
-and the following command line arguments:
-
-.. code-block:: bash
-
-    --learning_rate 3e-5 --warmup_steps 500 --adam_beta1 0.8 --adam_beta2 0.999 --adam_epsilon 1e-8 \
-    --weight_decay 3e-7 --lr_scheduler_type constant_with_warmup --fp16 --fp16_backend amp
-
-to achieve the same configuration as provided by the longer json file in the first example.
-
 When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer`
-to the console, so you can see exactly what the final configuration was passed to it.
+to the console, so you can see exactly what final configuration was passed to it.
+
+
+Passing Configuration
+=======================================================================================================================
+
+As discussed in this document, the DeepSpeed configuration is normally passed as a path to a json file. However, if
+you're not using the command line interface to configure the training, and instead instantiate the
+:class:`~transformers.Trainer` via :class:`~transformers.TrainingArguments`, then you can pass a nested ``dict`` as the
+``deepspeed`` argument. This allows you to create the configuration on the fly and doesn't require you to write it to
+the file system before passing it to :class:`~transformers.TrainingArguments`.
+
+To summarize, you can do:
+
+.. code-block:: python
+
+    TrainingArguments(..., deepspeed="/path/to/ds_config.json")
+
+or:
+
+.. code-block:: python
+
+    ds_config_dict=dict(scheduler=scheduler_params, optimizer=optimizer_params)
+    TrainingArguments(..., deepspeed=ds_config_dict)
+
+

 Shared Configuration
 =======================================================================================================================
@@ -761,9 +753,27 @@ no equivalent command line arguments.



-Optimizer
+Optimizer and Scheduler
 =======================================================================================================================

+As long as you don't enable ``cpu_offload`` you can mix and match DeepSpeed and HuggingFace schedulers and optimizers,
+with the exception of using the combination of HuggingFace scheduler and DeepSpeed optimizer:
+
++--------------+--------------+--------------+
+| Combos       | HF Scheduler | DS Scheduler |
++--------------+--------------+--------------+
+| HF Optimizer | Yes          | Yes          |
++--------------+--------------+--------------+
+| DS Optimizer | No           | Yes          |
++--------------+--------------+--------------+
+
+If ``cpu_offload`` is enabled you must use both the DeepSpeed scheduler and the DeepSpeed optimizer.
+
+
+
+Optimizer
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+

 DeepSpeed's main optimizers are Adam, AdamW, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are
 thus recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
@@ -773,7 +783,7 @@ If you don't configure the ``optimizer`` entry in the configuration file, the :c
 automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line
 arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``.

-Here is an example of the pre-configured ``optimizer`` entry for AdamW:
+Here is an example of the pre-configured ``optimizer`` entry for ``AdamW``:

 .. code-block:: json

@@ -789,6 +799,17 @@ Here is an example of the pre-configured ``optimizer`` entry for AdamW:
         }
    }

+Note that the command line arguments will override the values in the configuration file. This is so that there is one
+definitive source of the values and to avoid hard-to-find errors when, for example, the learning rate is set to
+different values in different places. The command line takes precedence. The values that get overridden are:
+
+- ``lr`` with the value of ``--learning_rate``
+- ``betas`` with the value of ``--adam_beta1 --adam_beta2``
+- ``eps`` with the value of ``--adam_epsilon``
+- ``weight_decay`` with the value of ``--weight_decay``
+
+Therefore please remember to tune the shared hyperparameters on the command line.
+
 If you want to use another optimizer which is not listed above, you will have to add ``"zero_allow_untested_optimizer":
 true`` to the top level configuration.

@@ -797,41 +818,24 @@ make sure to adjust the values. e.g. if use Adam you will want ``weight_decay``


 Scheduler
-=======================================================================================================================
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

 DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here
 <https://www.deepspeed.ai/docs/config-json/#scheduler-parameters>`__.

-If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
-the value of ``--lr_scheduler_type`` to configure it. Currently the :class:`~transformers.Trainer` supports only 2 LR
-schedulers that are also supported by DeepSpeed:
+
+Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed:

 * ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup``
 * ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``,
  therefore, if you don't configure the scheduler this is the scheduler that will get configured by default.

-In either case, the values of ``--learning_rate`` and ``--warmup_steps`` will be used for the configuration.

-In other words, if you don't use the configuration file to set the ``scheduler`` entry, provide either:
+If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
+the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version
+of it.

-.. code-block:: bash
-
-    --lr_scheduler_type constant_with_warmup --learning_rate 3e-5 --warmup_steps 500
-
-or
-
-.. code-block:: bash
-
-    --lr_scheduler_type linear --learning_rate 3e-5 --warmup_steps 500
-
-with the desired values. If you don't pass these arguments, reasonable default values will be used instead.
-
-In the case of WarmupDecayLR ``total_num_steps`` gets set either via the ``--max_steps`` command line argument, or if
-it is not provided, derived automatically at run time based on the environment and the size of the dataset and other
-command line arguments.
-
-Here is an example of the pre-configured ``scheduler`` entry for WarmupLR (``constant_with_warmup`` in the
-:class:`~transformers.Trainer` API):
+Here is an example of the pre-configured ``scheduler`` entry for ``WarmupLR``:

 .. code-block:: json

@@ -846,6 +850,39 @@ Here is an example of the pre-configured ``scheduler`` entry for WarmupLR (``con
         }
    }

+Note that the command line arguments will override the values in the configuration file. This is so that there is one
+definitive source of the values and to avoid hard-to-find errors when, for example, the learning rate is set to
+different values in different places. The command line takes precedence. The values that get overridden are:
+
+- ``warmup_max_lr`` with the value of ``--learning_rate``
+- ``warmup_num_steps`` with the value of ``--warmup_steps``
+- ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run
+  time based on the environment and the size of the dataset and other command line arguments (needed for
+  ``WarmupDecayLR``).
+
+Therefore please remember to tune the shared hyperparameters on the command line.
+
+For example, for ``WarmupDecayLR``, you can use the following entry:
+
+.. code-block:: json
+
+    {
+       "scheduler": {
+             "type": "WarmupDecayLR",
+             "params": {
+                 "total_num_steps": 10,
+                 "last_batch_iteration": -1,
+                 "warmup_min_lr": 0,
+                 "warmup_max_lr": 0.001,
+                 "warmup_num_steps": 1000
+             }
+         }
+    }
+
+and ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_steps`` will be corrected at loading time.
+
+
+
 Automatic Mixed Precision
 =======================================================================================================================

@@ -933,9 +970,9 @@ Notes
 * While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source
  <https://github.com/microsoft/deepspeed#installation>`__ to best match your hardware and also if you need to enable
  certain features, like 1-bit Adam, which aren't available in the pypi distribution.
-* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with HuggingFace ``transformers`` - you can
-  use any model with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration
-  instructions <https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.
+* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with 🤗 Transformers - you can use any model
+  with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration instructions
+  <https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.

 Main DeepSpeed Resources
 =======================================================================================================================
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -189,3 +189,52 @@ FlaxAutoModel

 .. autoclass:: transformers.FlaxAutoModel
    :members:
+
+
+FlaxAutoModelForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForPreTraining
+    :members:
+
+
+FlaxAutoModelForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForMaskedLM
+    :members:
+
+
+FlaxAutoModelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForSequenceClassification
+    :members:
+
+
+FlaxAutoModelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForQuestionAnswering
+    :members:
+
+
+FlaxAutoModelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForTokenClassification
+    :members:
+
+
+FlaxAutoModelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForMultipleChoice
+    :members:
+
+
+FlaxAutoModelForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForNextSentencePrediction
+    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -209,8 +209,50 @@ FlaxBertModel
    :members: __call__


+FlaxBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForPreTraining
+    :members: __call__
+
+
 FlaxBertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.FlaxBertForMaskedLM
    :members: __call__
+
+
+FlaxBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForNextSentencePrediction
+    :members: __call__
+
+
+FlaxBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForSequenceClassification
+    :members: __call__
+
+
+FlaxBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForMultipleChoice
+    :members: __call__
+
+
+FlaxBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForTokenClassification
+    :members: __call__
+
+
+FlaxBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/bigbird.rst
+++ b/docs/source/model_doc/bigbird.rst
@@ -0,0 +1,130 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BigBird
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BigBird model was proposed in `Big Bird: Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by
+Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
+Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird is a sparse-attention
+based transformer which extends Transformer based models, such as BERT, to much longer sequences. In addition to sparse
+attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
+has been shown that applying sparse, global, and random attention approximates full attention, while being
+computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
+BigBird has shown improved performance on various long document NLP tasks, such as question answering and
+summarization, compared to BERT or RoBERTa.
+
+The abstract from the paper is the following:
+
+*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
+Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
+length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
+reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
+is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
+theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
+sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
+8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
+BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
+propose novel applications to genomics data.*
+
+Tips:
+
+- For an in-depth explanation of how BigBird's attention works, see `this blog post
+  <https://huggingface.co/blog/big-bird>`__.
+- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For sequence lengths < 1024, using
+  **original_full** is advised as there is no benefit in using **block_sparse** attention.
+- The code currently uses a window size of 3 blocks and 2 global blocks.
+- Sequence length must be divisible by block size.
+- The current implementation supports only **ITC**.
+- The current implementation doesn't support **num_random_blocks = 0**.
+
+The original code can be found `here <https://github.com/google-research/bigbird>`__.
+
+BigBirdConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdConfig
+    :members:
+
+
+BigBirdTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+BigBird specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
+    :members:
+
+
+BigBirdModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdModel
+    :members: forward
+
+
+BigBirdForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForPreTraining
+    :members: forward
+
+
+BigBirdForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForCausalLM
+    :members: forward
+
+
+BigBirdForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForMaskedLM
+    :members: forward
+
+
+BigBirdForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForSequenceClassification
+    :members: forward
+
+
+BigBirdForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForMultipleChoice
+    :members: forward
+
+
+BigBirdForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForTokenClassification
+    :members: forward
+
+
+BigBirdForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BigBirdForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -50,7 +50,7 @@ The original code can be found `here <https://github.com/openai/finetune-transfo
 Note:

 If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install ``ftfy``
-and ``SpaCy``::
+and ``SpaCy``:

 .. code-block:: bash

--- a/docs/source/model_doc/gpt_neo.rst
+++ b/docs/source/model_doc/gpt_neo.rst
@@ -0,0 +1,65 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+GPT Neo
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The GPTNeo model was released in the `EleutherAI/gpt-neo <https://github.com/EleutherAI/gpt-neo>`__ repository by Sid
+Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. It is a GPT2 like causal language model trained on the
+`Pile <https://pile.eleuther.ai/>`__ dataset.
+
+The architecture is similar to GPT2 except that GPT Neo uses local attention in every other layer with a window size of
+256 tokens.
+
+Generation
+_______________________________________________________________________________________________________________________
+
+The :obj:`generate()` method can be used to generate text using the GPT Neo model.
+
+.. code-block::
+
+    >>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
+    >>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
+    >>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+
+GPTNeoConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTNeoConfig
+    :members:
+
+
+GPTNeoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTNeoModel
+    :members: forward
+
+
+GPTNeoForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTNeoForCausalLM
+    :members: forward
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -130,3 +130,31 @@ LayoutLMForTokenClassification

 .. autoclass:: transformers.LayoutLMForTokenClassification
    :members:
+
+
+TFLayoutLMModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLayoutLMModel
+    :members:
+
+
+TFLayoutLMForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLayoutLMForMaskedLM
+    :members:
+
+
+TFLayoutLMForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLayoutLMForSequenceClassification
+    :members:
+
+
+TFLayoutLMForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLayoutLMForTokenClassification
+    :members:
--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -0,0 +1,102 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Vision Transformer (ViT)
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
+    breaking changes to fix it in the future. If you see something strange, file a `Github Issue
+    <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Vision Transformer (ViT) model was proposed in `An Image is Worth 16x16 Words: Transformers for Image Recognition
+at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
+Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
+Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
+very good results compared to familiar convolutional architectures.
+
+
+The abstract from the paper is the following:
+
+*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
+applications to computer vision remain limited. In vision, attention is either applied in conjunction with
+convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
+structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
+sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
+data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
+Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
+substantially fewer computational resources to train.*
+
+Tips:
+
+- To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches,
+  which are then linearly embedded. A [CLS] token is added to serve as representation of an entire image, which can be
+  used for classification. The authors also add absolute position embeddings, and feed the resulting sequence of
+  vectors to a standard Transformer encoder.
+- The Vision Transformer was pre-trained using a resolution of 224x224. During fine-tuning, it is often beneficial to
+  use a higher resolution than pre-training `(Touvron et al., 2019) <https://arxiv.org/abs/1906.06423>`__, `(Kolesnikov
+  et al., 2020) <https://arxiv.org/abs/1912.11370>`__. The authors report the best results with a resolution of 384x384
+  during fine-tuning.
+- As the Vision Transformer expects each image to be of the same size (resolution), one can use
+  :class:`~transformers.ViTFeatureExtractor` to resize (or rescale) and normalize images for the model.
+- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
+  each checkpoint. For example, :obj:`google/vit-base-patch16-224` refers to a base-sized architecture with patch
+  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
+  <https://huggingface.co/models?search=vit>`__.
+- The available checkpoints are either (1) pre-trained on `ImageNet-21k <http://www.image-net.org/>`__ (a collection of
+  14 million images and 21k classes) only, or (2) also fine-tuned on `ImageNet
+  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
+  images and 1,000 classes).
+- The best results are obtained with supervised pre-training, which is not the case in NLP. The authors also performed
+  an experiment with a self-supervised pre-training objective, namely masked patch prediction (inspired by masked
+  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
+  improvement of 2% over training from scratch, but still 4% behind supervised pre-training.
+
+
+The original code (written in JAX) can be found `here <https://github.com/google-research/vision_transformer>`__.
+
+Note that we converted the weights from Ross Wightman's `timm library
+<https://github.com/rwightman/pytorch-image-models>`__, who already converted the weights from JAX to PyTorch. Credits
+go to him!
+
+
+ViTConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ViTConfig
+    :members:
+
+
+ViTFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ViTFeatureExtractor
+    :members: __call__
+
+
+ViTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ViTModel
+    :members: forward
+
+
+ViTForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ViTForImageClassification
+    :members: forward
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -18,8 +18,8 @@ that the metric applies specifically to classical language models (sometimes cal
 models) and is not well defined for masked language models like BERT (see :doc:`summary of the models
 <model_summary>`).

-Perplexity is defined as the exponentiated average log-likelihood of a sequence. If we have a tokenized sequence
-:math:`X = (x_0, x_1, \dots, x_t)`, then the perplexity of :math:`X` is,
+Perplexity is defined as the exponentiated average negative log-likelihood of a sequence. If we have a tokenized
+sequence :math:`X = (x_0, x_1, \dots, x_t)`, then the perplexity of :math:`X` is,

 .. math::

--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -139,6 +139,12 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
 |                    |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| GPTNeo             | ``EleutherAI/gpt-neo-1.3B``                                | | 24-layer, 2048-hidden, 16-heads, 1.3B parameters.                                                                                   |
+|                    |                                                            | | EleutherAI's GPT-3 like language model.                                                                                             |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``EleutherAI/gpt-neo-2.7B``                                | | 32-layer, 2560-hidden, 20-heads, 2.7B parameters.                                                                                   |
+|                    |                                                            | | EleutherAI's GPT-3 like language model.                                                                                             |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Transformer-XL     | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
 |                    |                                                            | | English model trained on wikitext-103                                                                                               |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
--- a/docs/source/sagemaker.md
+++ b/docs/source/sagemaker.md
@@ -0,0 +1,393 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Run training on Amazon SageMaker
+
+Hugging Face and Amazon are introducing new [Hugging Face Deep Learning Containers (DLCs)](https://github.com/aws/deep-learning-containers/blob/master/available_images.md#huggingface-training-containers) to make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
+
+To learn how to access and use the new Hugging Face DLCs with the Amazon SageMaker Python SDK, check out the guides and resources below.
+
+---
+
+## Deep Learning Container (DLC) overview
+
+The Deep Learning Containers are available in every region where Amazon SageMaker is available. You can see the [AWS region table](https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/) for all AWS global infrastructure. To get a detailed overview of all included packages, look [here in the release notes](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html).
+
+| 🤗 Transformers version | 🤗 Datasets version | PyTorch/TensorFlow version | type     | device | Python Version | Example `image_uri`                                                                                                               |
+| ----------------------- | ------------------- | -------------------------- | -------- | ------ | -------------- | --------------------------------------------------------------------------------------------------------------------------------- |
+| 4.4.2                   | 1.5.0               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04`    |
+| 4.4.2                   | 1.5.0               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04` |
+
+---
+
+## Getting Started: Train a 🤗 Transformers Model
+
+To train a 🤗 Transformers model by using the `HuggingFace` SageMaker Python SDK you need to:
+
+- [Prepare a training script](#prepare-a-transformers-fine-tuning-script)
+- [Create a `HuggingFace` Estimator](#create-an-huggingface-estimator)
+- [Run training by calling the `fit` method](#execute-training)
+- [Access your model](#access-trained-model)
+
+### Setup & Installation
+
+Before you can train a transformers model with Amazon SageMaker you need to sign up for an AWS account. If you do not have an AWS account yet, learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).
+
+After you complete these tasks you can get started using either [SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio-onboard.html), [SageMaker Notebook Instances](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html), or a local environment. To start training locally you need to configure the right [IAM permission](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).
+
+Upgrade to the latest `sagemaker` version.
+
+```bash
+pip install sagemaker --upgrade
+```
+
+**SageMaker environment**
+
+_Note: The execution role is intended to be available only when running a notebook within SageMaker. If you run `get_execution_role` in a notebook not on SageMaker, expect a "region" error._
+
+```python
+import sagemaker
+sess = sagemaker.Session()
+role = sagemaker.get_execution_role()
+```
+
+**Local environment**
+
+```python
+import sagemaker
+import boto3
+
+iam_client = boto3.client('iam')
+role = iam_client.get_role(RoleName='role-name-of-your-iam-role-with-right-permissions')['Role']['Arn']
+sess = sagemaker.Session()
+```
+
+### Prepare a 🤗 Transformers fine-tuning script.
+
+The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following:
+
+- `SM_MODEL_DIR`: A string that represents the path where the training job writes the model artifacts to. After training, artifacts in this directory are uploaded to S3 for model hosting. `SM_MODEL_DIR` is always set to `/opt/ml/model`.
+
+- `SM_NUM_GPUS`: An integer representing the number of GPUs available to the host.
+
+- `SM_CHANNEL_XXXX`: A string that represents the path to the directory that contains the input data for the specified channel. For example, if you specify two input channels in the HuggingFace estimator’s fit call, named `train` and `test`, the environment variables `SM_CHANNEL_TRAIN` and `SM_CHANNEL_TEST` are set.
+
+You can find a full list of the exposed environment variables [here](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md).
+
+Later we define `hyperparameters` in the [HuggingFace Estimator](#create-an-huggingface-estimator), which are passed in as named arguments and can be processed with the `ArgumentParser()`.
+
+```python
+import transformers
+import datasets
+import argparse
+import os
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    # hyperparameters sent by the client are passed as command-line arguments to the script.
+    parser.add_argument("--epochs", type=int, default=3)
+    parser.add_argument("--per_device_train_batch_size", type=int, default=32)
+    parser.add_argument("--model_name_or_path", type=str)
+
+    # Data, model, and output directories
+    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
+```
+
+_Note that SageMaker doesn’t support argparse actions. For example, if you want to use a boolean hyperparameter, specify `type` as `bool` in your script and provide an explicit `True` or `False` value._
+
+For a complete example of a 🤗 Transformers training script, see [train.py](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py)
+
+### Create an HuggingFace Estimator
+
+You run 🤗 Transformers training scripts on SageMaker by creating `HuggingFace` Estimators. The Estimator handles end-to-end Amazon SageMaker training. The training of your script is invoked when you call `fit` on a `HuggingFace` Estimator. In the Estimator you define which fine-tuning script should be used as `entry_point`, which `instance_type` should be used, and which `hyperparameters` are passed in. You can find all possible `HuggingFace` parameters [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html#huggingface-estimator) and an example of a fine-tuning script [here](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py).
+You can find all useable `instance_types` [here](https://aws.amazon.com/de/sagemaker/pricing/).
+
+The following code sample shows how you train a custom `HuggingFace` script `train.py`, passing in three hyperparameters (`epochs`, `per_device_train_batch_size`, and `model_name_or_path`).
+
+```python
+from sagemaker.huggingface import HuggingFace
+
+
+# hyperparameters, which are passed into the training job
+hyperparameters={'epochs': 1,
+                 'per_device_train_batch_size': 32,
+                 'model_name_or_path': 'distilbert-base-uncased'
+                 }
+
+# create the Estimator
+huggingface_estimator = HuggingFace(
+        entry_point='train.py',
+        source_dir='./scripts',
+        instance_type='ml.p3.2xlarge',
+        instance_count=1,
+        role=role,
+        transformers_version='4.4',
+        pytorch_version='1.6',
+        py_version='py36',
+        hyperparameters = hyperparameters
+)
+```
+
+To run the `TrainingJob` locally you can define `instance_type='local'` or `instance_type='local-gpu'` for gpu usage. _Note: this does not work within SageMaker Studio_
+
+### Execute Training
+
+You start your `TrainingJob` by calling `fit` on a `HuggingFace` Estimator. In the `fit` method you specify your input training data, like a string S3 URI `s3://my-bucket/my-training-data` or a `FileSystemInput` for [EFS or FSx Lustre](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs), see [here](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs).
+
+```python
+huggingface_estimator.fit(
+  {'train': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/train',
+   'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'}
+)
+
+```
+
+SageMaker takes care of starting and managing all the required EC2 instances for you and starts the training job by running:
+
+```bash
+/opt/conda/bin/python train.py --epochs 1 --model_name_or_path distilbert-base-uncased --per_device_train_batch_size 32
+```
+
+### Access trained model
+
+After training is done you can access your model either through the [AWS console](https://console.aws.amazon.com/console/home?nc2=h_ct&src=header-signin) or downloading it directly from S3.
+
+```python
+from sagemaker.s3 import S3Downloader
+
+S3Downloader.download(
+    s3_uri=huggingface_estimator.model_data, # s3 uri where the trained model is located
+    local_path='.', # local path where *.tar.gz is saved
+    sagemaker_session=sess # sagemaker session used for training the model
+)
+```
+
+---
+
+## Sample Notebooks
+
+You can find here a list of the official notebooks provided by Hugging Face.
+
+| Notebook                                                                                                                                                                                        | Description                                                                                                      |
+| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
+| [Getting Started Pytorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb)                                                   | End-to-End binary Text-Classification example using `Trainer` and `imdb` dataset                                 |
+| [Getting Started Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)                                             | End-to-End binary Text-Classification example using `Keras` and `imdb` dataset                                   |
+| [Distributed Training Data Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb)                       | End-to-End distributed Question-Answering example using `Trainer` and 🤗 Transformers example script for `SQuAD` |
+| [Distributed Training Model Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)                     | End-to-End model parallelism example using `SageMakerTrainer` and `run_glue.py` script                           |
+| [Spot Instances and continues training](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)                                              | End-to-End to Text-Classification example using spot instances with continued training.                          |
+| [SageMaker Metrics](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)                                                               | End-to-End to Text-Classification example using SageMaker Metrics to extract and log metrics during training     |
+| [Distributed Training Data Parallelism Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | End-to-End distributed binary Text-Classification example using `Keras` and `TensorFlow`                         |
+| [Distributed Seq2Seq Training with Data Parallelism and BART](https://github.com/huggingface/notebooks/blob/master/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) | End-to-End distributed summarization example `BART-large` and 🤗 Transformers example script for `summarization`                        |
+
+
+---
+
+## Advanced Features
+
+In addition to the Deep Learning Container and the SageMaker SDK, we have implemented other additional features.
+
+### Distributed Training: Data-Parallel
+
+You can use [SageMaker Data Parallelism Library](https://aws.amazon.com/blogs/aws/managed-data-parallelism-in-amazon-sagemaker-simplifies-training-on-large-datasets/) out of the box for distributed training. We added the functionality of Data Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.
+
+- [Example Notebook PyTorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
+- [Example Notebook TensorFlow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
+
+```python
+# configuration for running training on smdistributed Data Parallel
+distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
+
+# create the Estimator
+huggingface_estimator = HuggingFace(
+        entry_point='train.py',
+        source_dir='./scripts',
+        instance_type='ml.p3dn.24xlarge',
+        instance_count=2,
+        role=role,
+        transformers_version='4.4.2',
+        pytorch_version='1.6.0',
+        py_version='py36',
+        hyperparameters = hyperparameters,
+        distribution = distribution
+)
+
+```
+
+### Distributed Training: Model-Parallel
+
+You can use [SageMaker Model Parallelism Library](https://aws.amazon.com/blogs/aws/amazon-sagemaker-simplifies-training-deep-learning-models-with-billions-of-parameters/) out of the box for distributed training. We added the functionality of Model Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.  
+For detailed information about the adjustments take a look [here](https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html?highlight=modelparallel#required-sagemaker-python-sdk-parameters).
+
+
+- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
+
+
+```python
+# configuration for running training on smdistributed Model Parallel
+mpi_options = {
+    "enabled" : True,
+    "processes_per_host" : 8
+}
+
+smp_options = {
+    "enabled":True,
+    "parameters": {
+        "microbatches": 4,
+        "placement_strategy": "spread",
+        "pipeline": "interleaved",
+        "optimize": "speed",
+        "partitions": 4,
+        "ddp": True,
+    }
+}
+
+distribution={
+    "smdistributed": {"modelparallel": smp_options},
+    "mpi": mpi_options
+}
+
+ # create the Estimator
+huggingface_estimator = HuggingFace(
+        entry_point='train.py',
+        source_dir='./scripts',
+        instance_type='ml.p3dn.24xlarge',
+        instance_count=2,
+        role=role,
+        transformers_version='4.4.2',
+        pytorch_version='1.6.0',
+        py_version='py36',
+        hyperparameters = hyperparameters,
+        distribution = distribution
+)
+```
+
+### Spot Instances
+
+With the creation of HuggingFace Framework extension for the SageMaker Python SDK we can also leverage the benefit of [fully-managed EC2 spot instances](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html) and save up to 90% of our training cost.
+
+_Note: Unless your training job completes quickly, we recommend you use [checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html) with managed spot training, therefore you need to define the `checkpoint_s3_uri`._
+
+To use spot instances with the `HuggingFace` Estimator we have to set the `use_spot_instances` parameter to `True` and define your `max_wait` and `max_run` time. You can read more about the [managed spot training lifecycle here](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html).
+
+- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)
+
+```python
+# hyperparameters, which are passed into the training job
+hyperparameters={'epochs': 1,
+                 'train_batch_size': 32,
+                 'model_name':'distilbert-base-uncased',
+                 'output_dir':'/opt/ml/checkpoints'
+                 }
+# create the Estimator
+
+huggingface_estimator = HuggingFace(
+        entry_point='train.py',
+        source_dir='./scripts',
+        instance_type='ml.p3.2xlarge',
+        instance_count=1,
+        checkpoint_s3_uri=f's3://{sess.default_bucket()}/checkpoints',
+        use_spot_instances=True,
+        max_wait=3600, # This should be equal to or greater than max_run in seconds
+        max_run=1000,
+        role=role,
+        transformers_version='4.4',
+        pytorch_version='1.6',
+        py_version='py36',
+        hyperparameters = hyperparameters
+)
+
+# Training seconds: 874
+# Billable seconds: 262
+# Managed Spot Training savings: 70.0%
+
+```
+
+### Git Repository
+
+When you create a `HuggingFace` Estimator, you can specify a [training script that is stored in a GitHub repository](https://sagemaker.readthedocs.io/en/stable/overview.html#use-scripts-stored-in-a-git-repository) as the entry point for the estimator, so that you don’t have to download the scripts locally. If Git support is enabled, the `entry_point` and `source_dir` should be relative paths in the Git repo if provided. 
+
+If you are using `git_config` to run the [🤗 Transformers example scripts](https://github.com/huggingface/transformers/tree/master/examples), keep in mind that you need to configure the right `'branch'` for your `transformers_version`, e.g. if you use `transformers_version='4.4.2'` you have to use `'branch':'v4.4.2'`.
+
+The following example uses `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/text-classification).
+
+_Tip: define `output_dir` as `/opt/ml/model` in the hyperparameter for the script to save your model to S3 after training._
+
+- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)
+
+```python
+# configure git settings
+git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.4.2'} # v4.4.2 refers to the `transformers_version` you use in the estimator.
+
+ # create the Estimator
+huggingface_estimator = HuggingFace(
+        entry_point='run_glue.py',
+        source_dir='./examples/text-classification',
+        git_config=git_config,
+        instance_type='ml.p3.2xlarge',
+        instance_count=1,
+        role=role,
+        transformers_version='4.4',
+        pytorch_version='1.6',
+        py_version='py36',
+        hyperparameters=hyperparameters
+)
+
+```
+
+### SageMaker Metrics
+
+[SageMaker Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html#define-train-metrics) can automatically parse the logs for metrics and send those metrics to CloudWatch. If you want SageMaker to parse logs you have to specify the metrics that you want SageMaker to send to CloudWatch when you configure the training job. You specify the name of the metrics that you want to send and the regular expressions that SageMaker uses to parse the logs that your algorithm emits to find those metrics.
+
+- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)
+
+```python
+# define metrics definitions
+
+metric_definitions = [
+{"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
+{"Name": "eval_accuracy", "Regex": r"eval_accuracy.*=\D*(.*?)$"},
+{"Name": "eval_loss", "Regex": r"eval_loss.*=\D*(.*?)$"},
+]
+
+# create the Estimator
+
+huggingface_estimator = HuggingFace(
+        entry_point='train.py',
+        source_dir='./scripts',
+        instance_type='ml.p3.2xlarge',
+        instance_count=1,
+        role=role,
+        transformers_version='4.4',
+        pytorch_version='1.6',
+        py_version='py36',
+        metric_definitions=metric_definitions,
+        hyperparameters = hyperparameters)
+
+```
+
+## Additional Resources
+
+- [Announcement Blog Post](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face)
+
+- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/)
+
+- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
+
+- [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -454,7 +454,7 @@ of tokens.
    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")

-    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")

--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -151,7 +151,6 @@ As mentioned earlier you can see what tests are contained inside the ``Optimizat

   pytest tests/test_optimization.py::OptimizationTest --collect-only -q

-
 You can run tests by keyword expressions.

 To run only tests whose name contains ``adam``:
@@ -160,6 +159,9 @@ To run only tests whose name contains ``adam``:

   pytest -k adam tests/test_optimization.py

+Logical ``and`` and ``or`` can be used to indicate whether all keywords should match or either. ``not`` can be used to
+negate.
+
 To run all tests except those whose name contains ``adam``:

 .. code-block:: bash
@@ -168,11 +170,24 @@ To run all tests except those whose name contains ``adam``:

 And you can combine the two patterns in one:

-
 .. code-block:: bash

   pytest -k "ada and not adam" tests/test_optimization.py

+For example to run both ``test_adafactor`` and ``test_adam_w`` you can use:
+
+.. code-block:: bash
+
+   pytest -k "test_adafactor or test_adam_w" tests/test_optimization.py
+
+Note that we use ``or`` here, since we want either of the keywords to match to include both.
+
+If you want to include only tests that include both patterns, ``and`` is to be used:
+
+.. code-block:: bash
+
+   pytest -k "test and ada" tests/test_optimization.py
+


 Run only modified tests
--- a/examples/README.md
+++ b/examples/README.md
@@ -15,8 +15,13 @@ limitations under the License.

 # Examples

-This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to
-be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects).
+This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects) or to the [legacy](https://github.com/huggingface/transformers/tree/master/examples/legacy) subfolder.
+
+While we strive to present as many use cases as possible, the scripts in this folder are just examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs. To help you with that, all the PyTorch versions of the examples fully expose the preprocessing of the data. This way, you can easily tweak them.
+
+This is similar if you want the scripts to report another metric than the one they currently use: look at the `compute_metrics` function inside the script. It takes the full arrays of predictions and labels and has to return a dictionary of string keys and float values. Just change it to add (or replace) your own metric to the ones already reported.
+
+Please discuss on the [forum](https://discuss.huggingface.co/) or in an [issue](https://github.com/huggingface/transformers/issues) a feature you would like to implement in an example before submitting a PR: we welcome bug fixes but since we want to keep the examples as simple as possible, it's unlikely we will merge a pull request adding more functionality at the cost of readability.

 ## Important note

@@ -85,14 +90,46 @@ Coming soon!

 | Task | Example datasets | Trainer support | TFTrainer support | 🤗 Datasets | Colab
 |---|---|:---:|:---:|:---:|:---:|
-| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | Raw text        | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
-| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
+| [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | WikiText-2      | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)
+| [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG            | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)
 | [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)
-| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                     | CNN/Daily Mail  | ✅  | - | - | -
-| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
+| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                     |  XSum           | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE            | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
 | [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)           | -               | n/a | n/a | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
 | [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)
-| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                       | WMT             | ✅  | - | - | -
+| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                       | WMT             | ✅  | - | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb)
+
+
+## Running quick tests
+
+Most examples are equipped with a mechanism to truncate the number of dataset samples to the desired length. This is useful for debugging purposes, for example to quickly check that all stages of the programs can complete, before running the same setup on the full dataset which may take hours to complete.
+
+For example here is how to truncate all three splits to just 50 samples each:
+```
+examples/token-classification/run_ner.py \
+--max_train_samples 50 \
+--max_val_samples 50 \
+--max_test_samples 50 \
+[...]
+```
+
+Most example scripts should have the first two command line arguments and some have the third one. You can quickly check if a given example supports any of these by passing a `-h` option, e.g.:
+```
+examples/token-classification/run_ner.py -h
+```
+
+## Resuming training
+
+You can resume training from a previous checkpoint like this:
+
+1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance).
+2. Pass `--model_name_or_path path_to_a_specific_checkpoint` to resume training from that checkpoint folder.
+
+Should you want to turn an example into a notebook where you'd no longer have access to the command
+line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`.
+
+1. If `resume_from_checkpoint` is `True` it will look for the last checkpoint in the value of `output_dir` passed via `TrainingArguments`.
+2. If `resume_from_checkpoint` is a path to a specific checkpoint it will use that saved checkpoint folder to resume the training from.


 ## Distributed training and mixed precision
@@ -104,7 +141,7 @@ use the following command:
 ```bash
 python -m torch.distributed.launch \
    --nproc_per_node number_of_gpu_you_have path_to_script.py \
-	--all_arguments_of_the_script 
+	--all_arguments_of_the_script
 ```

 As an example, here is how you would fine-tune the BERT large model (with whole word masking) on the text
@@ -148,7 +185,7 @@ regular training script with its arguments (this is similar to the `torch.distri
 ```bash
 python xla_spawn.py --num_cores num_tpu_you_have \
    path_to_script.py \
-	--all_arguments_of_the_script 
+	--all_arguments_of_the_script
 ```

 As an example, here is how you would fine-tune the BERT large model (with whole word masking) on the text
@@ -203,34 +240,11 @@ Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metri

 Advanced configuration is possible by setting environment variables:

-<table>
-  <thead>
-    <tr>
-      <th style="text-align:left">Environment Variables</th>
-      <th style="text-align:left">Options</th>
-    </tr>
-  </thead>
-  <tbody>
-    <tr>
-      <td style="text-align:left">WANDB_LOG_MODEL</td>
-      <td style="text-align:left">Log the model as artifact at the end of training (<b>false</b> by default)</td>
-    </tr>
-    <tr>
-      <td style="text-align:left">WANDB_WATCH</td>
-      <td style="text-align:left">
-        <ul>
-          <li><b>gradients</b> (default): Log histograms of the gradients</li>
-          <li><b>all</b>: Log histograms of gradients and parameters</li>
-          <li><b>false</b>: No gradient or parameter logging</li>
-        </ul>
-      </td>
-    </tr>
-    <tr>
-      <td style="text-align:left">WANDB_PROJECT</td>
-      <td style="text-align:left">Organize runs by project</td>
-    </tr>
-  </tbody>
-</table>
+| Environment Variable | Value |
+|---|---|
+| WANDB_LOG_MODEL | Log the model as artifact at the end of training (`false` by default) |
+| WANDB_WATCH | one of `gradients` (default) to log histograms of gradients, `all` to log histograms of both gradients and parameters, or `false` for no histogram logging |
+| WANDB_PROJECT | Organize runs by project |

 Set run names with `run_name` argument present in scripts or as part of `TrainingArguments`.

--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -22,12 +22,11 @@ ALBERT, BERT, DistilBERT, RoBERTa, XLNet... GPT and GPT-2 are trained or fine-tu
 loss. XLNet uses permutation language modeling (PLM), you can find more information about the differences between those
 objectives in our [model summary](https://huggingface.co/transformers/model_summary.html).

-These scripts leverage the 🤗 Datasets library and the Trainer API. You can easily customize them to your needs if you
-need extra processing on your datasets.
+There are two sets of scripts provided. The first set leverages the Trainer API. The second set with `no_trainer` in the suffix uses a custom training loop and leverages the 🤗 Accelerate library. Both sets use the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

 **Note:** The old script `run_language_modeling.py` is still available [here](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py).

-The following examples, will run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own
+The following examples will run on datasets hosted on our [hub](https://huggingface.co/datasets) or with your own
 text files for training and validation. We give examples of both below.

 ### GPT-2/GPT and causal language modeling
@@ -60,6 +59,15 @@ python run_clm.py \
    --output_dir /tmp/test-clm
 ```

+This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_clm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
+
+```bash
+python run_clm_no_trainer.py \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --model_name_or_path gpt2 \
+    --output_dir /tmp/test-clm
+```

 ### RoBERTa/BERT/DistilBERT and masked language modeling

@@ -95,23 +103,33 @@ python run_mlm.py \
 If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script
 concatenates all texts and then splits them in blocks of the same length).

+This uses the built in HuggingFace `Trainer` for training. If you want to use a custom training loop, you can utilize or adapt the `run_mlm_no_trainer.py` script. Take a look at the script for a list of supported arguments. An example is shown below:
+
+```bash
+python run_mlm_no_trainer.py \
+    --dataset_name wikitext \
+    --dataset_config_name wikitext-2-raw-v1 \
+    --model_name_or_path roberta-base \
+    --output_dir /tmp/test-mlm
+```
+
 **Note:** On TPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make
 sure all your batches have the same length.

 ### Whole word masking

-This part was moved to `examples/research_projects/mlm_wwm`. 
+This part was moved to `examples/research_projects/mlm_wwm`.

 ### XLNet and permutation language modeling

-XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method 
-to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input 
+XLNet uses a different training objective, which is permutation language modeling. It is an autoregressive method
+to learn bidirectional contexts by maximizing the expected likelihood over all permutations of the input
 sequence factorization order.

-We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding 
+We use the `--plm_probability` flag to define the ratio of length of a span of masked tokens to surrounding
 context length for permutation language modeling.

-The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used 
+The `--max_span_length` flag may also be used to limit the length of a span of masked tokens used
 for permutation language modeling.

 Here is how to fine-tune XLNet on wikitext-2:
--- a/examples/language-modeling/run_clm.py
+++ b/examples/language-modeling/run_clm.py
@@ -48,7 +48,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -213,7 +213,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/language-modeling/run_clm_no_trainer.py
+++ b/examples/language-modeling/run_clm_no_trainer.py
@@ -0,0 +1,456 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
+on a text file or a dataset without using HuggingFace Trainer.
+
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=causal-lm
+"""
+# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
+
+import argparse
+import logging
+import math
+import os
+import random
+
+import datasets
+import torch
+from datasets import load_dataset
+from torch.utils.data.dataloader import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
+from accelerate import Accelerator
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_MAPPING,
+    AdamW,
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    SchedulerType,
+    default_data_collator,
+    get_scheduler,
+    set_seed,
+)
+
+
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Config classes that are keys of MODEL_MAPPING, and the `model_type` attribute of each of them.
+# MODEL_TYPES is used as the list of valid `choices` for the `--model_type` command-line argument.
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def parse_args():
+    """
+    Build the command-line parser for this script, parse ``sys.argv`` and validate the result.
+
+    Returns:
+        argparse.Namespace: the parsed arguments.
+
+    Raises:
+        ValueError: if neither a dataset name nor a training/validation file is given, or if a given
+            training/validation file does not have a ``csv``, ``json`` or ``txt`` extension.
+
+    Side effects:
+        Creates ``--output_dir`` (including missing parents) when it is provided.
+    """
+
+    def str_to_bool(value):
+        # `type=bool` would map every non-empty string (including "False") to True, so booleans given on the
+        # command line are parsed explicitly. The empty string stays False, as it was with `bool("")`.
+        lowered = value.lower()
+        if lowered in ("true", "t", "yes", "y", "1"):
+            return True
+        if lowered in ("false", "f", "no", "n", "0", ""):
+            return False
+        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help="The name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The configuration name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--train_file", type=str, default=None, help="A csv, json or txt file containing the training data."
+    )
+    parser.add_argument(
+        "--validation_file", type=str, default=None, help="A csv, json or txt file containing the validation data."
+    )
+    parser.add_argument(
+        "--validation_split_percentage",
+        type=int,
+        default=5,
+        help="The percentage of the train set used as validation set in case there's no validation split",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--config_name",
+        type=str,
+        default=None,
+        help="Pretrained config name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+    )
+    parser.add_argument(
+        "--per_device_train_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--per_device_eval_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the evaluation dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--lr_scheduler_type",
+        type=SchedulerType,
+        default="linear",
+        help="The scheduler type to use.",
+        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
+    )
+    parser.add_argument(
+        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default=None,
+        help="Model type to use if training from scratch.",
+        choices=MODEL_TYPES,
+    )
+    parser.add_argument(
+        "--block_size",
+        type=int,
+        default=None,
+        help="Optional input sequence length after tokenization. The training dataset will be truncated in block of this size for training. Default to the model max input length for single sentence inputs (take into account special tokens).",
+    )
+    parser.add_argument(
+        "--preprocessing_num_workers",
+        type=int,
+        default=None,
+        help="The number of processes to use for the preprocessing.",
+    )
+    # Accepts both the bare flag (`--overwrite_cache`) and an explicit value (`--overwrite_cache true/false`).
+    parser.add_argument(
+        "--overwrite_cache",
+        type=str_to_bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="Overwrite the cached training and evaluation sets",
+    )
+
+    args = parser.parse_args()
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+
+    # Explicit exceptions instead of `assert`, so the checks still run under `python -O`.
+    for arg_name in ("train_file", "validation_file"):
+        file_path = getattr(args, arg_name)
+        if file_path is not None and file_path.split(".")[-1] not in ("csv", "json", "txt"):
+            raise ValueError(f"`{arg_name}` should be a csv, json or txt file.")
+
+    if args.output_dir is not None:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator = Accelerator()
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state)
+
+    # Setup logging, we only want one process per machine to log things on the screen.
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                split=f"train[:{args.validation_split_percentage}%]",
+            )
+            raw_datasets["train"] = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                split=f"train[{args.validation_split_percentage}%:]",
+            )
+    else:
+        data_files = {}
+        if args.train_file is not None:
+            data_files["train"] = args.train_file
+        if args.validation_file is not None:
+            data_files["validation"] = args.validation_file
+        extension = args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.config_name)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
+    elif args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if args.model_name_or_path:
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,
+            from_tf=bool(".ckpt" in args.model_name_or_path),
+            config=config,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelForCausalLM.from_config(config)
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    column_names = raw_datasets["train"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    def tokenize_function(examples):
+        return tokenizer(examples[text_column_name])
+
+    tokenized_datasets = raw_datasets.map(
+        tokenize_function,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not args.overwrite_cache,
+    )
+
+    if args.block_size is None:
+        block_size = tokenizer.model_max_length
+        if block_size > 1024:
+            logger.warn(
+                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
+            )
+        block_size = 1024
+    else:
+        if args.block_size > tokenizer.model_max_length:
+            logger.warn(
+                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
+                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
+            )
+        block_size = min(args.block_size, tokenizer.model_max_length)
+
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        total_length = (total_length // block_size) * block_size
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+            for k, t in concatenated_examples.items()
+        }
+        result["labels"] = result["input_ids"].copy()
+        return result
+
+    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
+    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
+    # to preprocess.
+    #
+    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+
+    lm_datasets = tokenized_datasets.map(
+        group_texts,
+        batched=True,
+        num_proc=args.preprocessing_num_workers,
+        load_from_cache_file=not args.overwrite_cache,
+    )
+
+    train_dataset = lm_datasets["train"]
+    eval_dataset = lm_datasets["validation"]
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # DataLoaders creation:
+    train_dataloader = DataLoader(
+        train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
+    )
+    eval_dataloader = DataLoader(
+        eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
+    )
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
+    # shorter in multiprocess)
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+        model.eval()
+        losses = []
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+
+            loss = outputs.loss
+            losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
+
+        losses = torch.cat(losses)
+        losses = losses[: len(eval_dataset)]
+        perplexity = math.exp(torch.mean(losses))
+
+        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/language-modeling/run_mlm.py
+++ b/examples/language-modeling/run_mlm.py
@@ -48,7 +48,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)
 MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys())
@@ -223,7 +223,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/language-modeling/run_mlm_flax.py
+++ b/examples/language-modeling/run_mlm_flax.py
@@ -307,7 +307,7 @@ def create_learning_rate_scheduler(
                progress = jnp.maximum(0.0, (step - warmup_steps) / float(steps_per_cycle))
                ret *= jnp.maximum(0.0, 0.5 * (1.0 + jnp.cos(jnp.pi * (progress % 1.0))))
            else:
-                raise ValueError("Unknown factor %s." % name)
+                raise ValueError(f"Unknown factor {name}.")
        return jnp.asarray(ret, dtype=jnp.float32)

    return step_fn
@@ -332,9 +332,7 @@ def accuracy(logits, targets, weights=None):
      Tuple of scalar loss and batch normalizing factor.
    """
    if logits.ndim != targets.ndim + 1:
-        raise ValueError(
-            "Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape))
-        )
+        raise ValueError(f"Incorrect shapes. Got shape {logits.shape} logits and {targets.shape} targets")

    loss = jnp.equal(jnp.argmax(logits, axis=-1), targets)
    loss *= weights
@@ -353,9 +351,7 @@ def cross_entropy(logits, targets, weights=None, label_smoothing=0.0):
      Tuple of scalar loss and batch normalizing factor.
    """
    if logits.ndim != targets.ndim + 1:
-        raise ValueError(
-            "Incorrect shapes. Got shape %s logits and %s targets" % (str(logits.shape), str(targets.shape))
-        )
+        raise ValueError(f"Incorrect shapes. Got shape {logits.shape} logits and {targets.shape} targets")

    vocab_size = logits.shape[-1]
    confidence = 1.0 - label_smoothing
@@ -463,7 +459,7 @@ if __name__ == "__main__":
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/language-modeling/run_mlm_no_trainer.py
+++ b/examples/language-modeling/run_mlm_no_trainer.py
@@ -0,0 +1,500 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...)
+on a text file or a dataset without using HuggingFace Trainer.
+
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=masked-lm
+"""
+# You can also adapt this script on your own mlm task. Pointers for this are left as comments.
+
+import argparse
+import logging
+import math
+import os
+import random
+
+import datasets
+import torch
+from datasets import load_dataset
+from torch.utils.data.dataloader import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
+from accelerate import Accelerator
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_MAPPING,
+    AdamW,
+    AutoConfig,
+    AutoModelForMaskedLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    SchedulerType,
+    get_scheduler,
+    set_seed,
+)
+
+
+logger = logging.getLogger(__name__)
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def parse_args():
+    """
+    Parse the command-line arguments and return them as a namespace, creating ``--output_dir`` when it is given.
+    Raises ``ValueError`` if no dataset name or data file is passed, ``AssertionError`` for a non csv/json/txt file.
+    """
+
+    def _to_bool(value):
+        # `type=bool` would map every non-empty string (even "False") to True, so read the text of the flag instead.
+        return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off")
+
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a Masked Language Modeling task")
+    parser.add_argument(
+        "--dataset_name", type=str, default=None, help="The name of the dataset to use (via the datasets library)."
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The configuration name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
+    )
+    parser.add_argument(
+        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
+    )
+    parser.add_argument(
+        "--validation_split_percentage",
+        default=5,
+        help="The percentage of the train set used as validation set in case there's no validation split",
+    )
+    parser.add_argument(
+        "--pad_to_max_length",
+        action="store_true",
+        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--config_name", type=str, default=None, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+    )
+    parser.add_argument(
+        "--per_device_train_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--per_device_eval_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the evaluation dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--lr_scheduler_type",
+        type=SchedulerType,
+        default="linear",
+        help="The scheduler type to use.",
+        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
+    )
+    parser.add_argument(
+        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default=None,
+        help="Model type to use if training from scratch.",
+        choices=MODEL_TYPES,
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        type=int,
+        default=None,
+        help="The maximum total input sequence length after tokenization. Sequences longer than this will be truncated.",
+    )
+    parser.add_argument(
+        "--line_by_line",
+        type=_to_bool,
+        default=False,
+        help="Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
+    )
+    parser.add_argument(
+        "--preprocessing_num_workers",
+        type=int,
+        default=None,
+        help="The number of processes to use for the preprocessing.",
+    )
+    parser.add_argument(
+        "--overwrite_cache", type=_to_bool, default=False, help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument(
+        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
+    )
+
+    args = parser.parse_args()
+
+    # Sanity checks
+    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+    for name in ("train_file", "validation_file"):
+        data_file = getattr(args, name)
+        if data_file is not None:
+            assert data_file.split(".")[-1] in ["csv", "json", "txt"], f"`{name}` should be a csv, json or txt file."
+
+    if args.output_dir is not None:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    return args
+
+
+def main():
+    """Run masked-LM fine-tuning end to end: load data and model, tokenize, train with Accelerate, evaluate, save."""
+    args = parse_args()
+
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator = Accelerator()
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state)
+
+    # Setup logging, we only want one process per machine to log things on the screen.
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, load_dataset guarantees that only one local process downloads the dataset at a time.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                split=f"train[:{args.validation_split_percentage}%]",
+            )
+            raw_datasets["train"] = load_dataset(
+                args.dataset_name,
+                args.dataset_config_name,
+                split=f"train[{args.validation_split_percentage}%:]",
+            )
+    else:
+        data_files = {}
+        if args.train_file is not None:
+            data_files["train"] = args.train_file
+        if args.validation_file is not None:
+            data_files["validation"] = args.validation_file
+        extension = args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.config_name)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
+    elif args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if args.model_name_or_path:
+        model = AutoModelForMaskedLM.from_pretrained(
+            args.model_name_or_path,
+            from_tf=bool(".ckpt" in args.model_name_or_path),
+            config=config,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelForMaskedLM.from_config(config)
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    column_names = raw_datasets["train"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    if args.max_seq_length is None:
+        max_seq_length = tokenizer.model_max_length
+        if max_seq_length > 1024:
+            logger.warning(
+                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
+                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
+            )
+            max_seq_length = 1024
+    else:
+        if args.max_seq_length > tokenizer.model_max_length:
+            logger.warning(
+                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
+                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+            )
+        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
+
+    if args.line_by_line:
+        # When using line_by_line, we just tokenize each nonempty line.
+        padding = "max_length" if args.pad_to_max_length else False
+
+        def tokenize_function(examples):
+            # Remove empty lines (read from the selected text column, which is not necessarily called "text").
+            examples[text_column_name] = [line for line in examples[text_column_name] if line and not line.isspace()]
+            return tokenizer(
+                examples[text_column_name],
+                padding=padding,
+                truncation=True,
+                max_length=max_seq_length,
+                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
+                # receives the `special_tokens_mask`.
+                return_special_tokens_mask=True,
+            )
+
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=[text_column_name],
+            load_from_cache_file=not args.overwrite_cache,
+        )
+    else:
+        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
+        # efficient when it receives the `special_tokens_mask`.
+        def tokenize_function(examples):
+            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
+
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+        )
+
+        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
+        # max_seq_length.
+        def group_texts(examples):
+            # Concatenate all texts (flattened in one pass; `sum(lists, [])` would copy the result at every step).
+            concatenated_examples = {k: [tok for seq in examples[k] for tok in seq] for k in examples.keys()}
+            total_length = len(concatenated_examples[list(examples.keys())[0]])
+            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+            # customize this part to your needs.
+            total_length = (total_length // max_seq_length) * max_seq_length
+            # Split by chunks of max_len.
+            result = {
+                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
+                for k, t in concatenated_examples.items()
+            }
+            return result
+
+        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+        # might be slower to preprocess.
+        #
+        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+
+        tokenized_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=args.preprocessing_num_workers,
+            load_from_cache_file=not args.overwrite_cache,
+        )
+
+    train_dataset = tokenized_datasets["train"]
+    eval_dataset = tokenized_datasets["validation"]
+
+    # Log a few random samples from the training set (fewer than 3 if the set is smaller than that):
+    for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability)
+
+    # DataLoaders creation:
+    train_dataloader = DataLoader(
+        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
+    )
+    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note: prepare the training dataloader before reading its length below (it gets shorter in multiprocess).
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            # Update after every `gradient_accumulation_steps` batches (and on the last one), as counted above.
+            if (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+        model.eval()
+        losses = []
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+
+            loss = outputs.loss
+            losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
+
+        losses = torch.cat(losses)
+        losses = losses[: len(eval_dataset)]
+        perplexity = math.exp(torch.mean(losses))
+
+        logger.info(f"epoch {epoch}: perplexity: {perplexity}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/language-modeling/run_plm.py
+++ b/examples/language-modeling/run_plm.py
@@ -44,7 +44,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -220,7 +220,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/legacy/token-classification/run_tf_ner.py
+++ b/examples/legacy/token-classification/run_tf_ner.py
--- a/examples/multiple-choice/README.md
+++ b/examples/multiple-choice/README.md
@@ -14,11 +14,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->

-## Multiple Choice
+# Multiple Choice

 Based on the script [`run_swag.py`]().

-#### Fine-tuning on SWAG
+## PyTorch script: fine-tuning on SWAG
+
+`run_swag` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture has a `ForMultipleChoice` version in the library) on the SWAG dataset or your own csv/jsonlines files as long as they are structured the same way. To make it work on another dataset, you will need to tweak the `preprocess_function` inside the script.

 ```bash
 python examples/multiple-choice/run_swag.py \
@@ -39,6 +41,73 @@ eval_acc = 0.8338998300509847
 eval_loss = 0.44457291918821606
 ```

+## PyTorch version, no Trainer
+
+Based on the script [`run_swag_no_trainer.py`](https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/run_swag_no_trainer.py).
+
+Like `run_swag.py`, this script allows you to fine-tune any of the models on the [hub](https://huggingface.co/models) (as long as its architecture has a `ForMultipleChoice` version in the library) on
+the SWAG dataset or your own data in a csv or a JSON file. The main difference is that this
+script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
+
+It offers fewer options than the script with `Trainer` (but you can easily change the options for the optimizer
+or the dataloaders directly in the script), but it still runs in a distributed setup or on TPU and supports mixed precision by
+means of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
+after installing it:
+
+```bash
+pip install accelerate
+```
+
+then
+
+```bash
+export DATASET_NAME=swag
+
+python run_swag_no_trainer.py \
+  --model_name_or_path bert-base-cased \
+  --dataset_name $DATASET_NAME \
+  --max_length 128 \
+  --per_device_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --output_dir /tmp/$DATASET_NAME/
+```
+
+You can then use your usual launchers to run it in a distributed environment, but the easiest way is to run
+
+```bash
+accelerate config
+```
+
+and reply to the questions asked. Then
+
+```bash
+accelerate test
+```
+
+that will check everything is ready for training. Finally, you can launch training with
+
+```bash
+export DATASET_NAME=swag
+
+accelerate launch run_swag_no_trainer.py \
+  --model_name_or_path bert-base-cased \
+  --dataset_name $DATASET_NAME \
+  --max_length 128 \
+  --per_device_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --output_dir /tmp/$DATASET_NAME/
+```
+
+This command is the same and will work for:
+
+- a CPU-only setup
+- a setup with one GPU
+- a distributed training with several GPUs (single or multi node)
+- a training on TPUs
+
+Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.

 ## Tensorflow

@@ -60,6 +129,3 @@ python ./examples/multiple-choice/run_tf_multiple_choice.py \
 --gradient_accumulation_steps 2 \
 --overwrite_output
 ```
-
-# Run it in colab
-[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
--- a/examples/multiple-choice/run_no_trainer.sh
+++ b/examples/multiple-choice/run_no_trainer.sh
@@ -0,0 +1,19 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+accelerate launch run_swag_no_trainer.py \
+  --model_name_or_path bert-base-uncased \
+  --dataset_name swag \
+  --output_dir /tmp/test-swag-no-trainer \
+  --pad_to_max_length
--- a/examples/multiple-choice/run_swag.py
+++ b/examples/multiple-choice/run_swag.py
@@ -46,7 +46,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -247,7 +247,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/multiple-choice/run_swag_no_trainer.py
+++ b/examples/multiple-choice/run_swag_no_trainer.py
@@ -0,0 +1,488 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning a 🤗 Transformers model on multiple choice relying on the accelerate library without using a Trainer.
+"""
+# You can also adapt this script on your own multiple choice task. Pointers for this are left as comments.
+
+import argparse
+import logging
+import math
+import os
+import random
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import datasets
+import torch
+from datasets import load_dataset, load_metric
+from torch.utils.data.dataloader import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
+from accelerate import Accelerator
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_MAPPING,
+    AdamW,
+    AutoConfig,
+    AutoModelForMultipleChoice,
+    AutoTokenizer,
+    PreTrainedTokenizerBase,
+    SchedulerType,
+    default_data_collator,
+    get_scheduler,
+    set_seed,
+)
+from transformers.file_utils import PaddingStrategy
+
+
+logger = logging.getLogger(__name__)
+# Config classes taken from `MODEL_MAPPING`; narrow this to your particular problem to better document `--model_type`.
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help="The name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The configuration name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
+    )
+    parser.add_argument(
+        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
+    )
+    parser.add_argument(
+        "--max_length",
+        type=int,
+        default=128,
+        help=(
+            "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
+            " sequences shorter will be padded if `--pad_to_max_lengh` is passed."
+        ),
+    )
+    parser.add_argument(
+        "--pad_to_max_length",
+        action="store_true",
+        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--config_name",
+        type=str,
+        default=None,
+        help="Pretrained config name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--use_slow_tokenizer",
+        action="store_true",
+        help="If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library).",
+    )
+    parser.add_argument(
+        "--per_device_train_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--per_device_eval_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the evaluation dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--lr_scheduler_type",
+        type=SchedulerType,
+        default="linear",
+        help="The scheduler type to use.",
+        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
+    )
+    parser.add_argument(
+        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default=None,
+        help="Model type to use if training from scratch.",
+        choices=MODEL_TYPES,
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Activate debug mode and run training only with a subset of data.",
+    )
+    args = parser.parse_args()
+    if args.output_dir is not None:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    return args
+
+
+@dataclass
+class DataCollatorForMultipleChoice:
+    """
+    Data collator that will dynamically pad the inputs for multiple choice received.
+
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+            sequence if provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+            different lengths).
+        max_length (:obj:`int`, `optional`):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the sequence to a multiple of the provided value.
+
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+
+    def __call__(self, features):
+        label_name = "label" if "label" in features[0].keys() else "labels"
+        labels = [feature.pop(label_name) for feature in features]
+        batch_size = len(features)
+        num_choices = len(features[0]["input_ids"])
+        flattened_features = [
+            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
+        ]
+        flattened_features = sum(flattened_features, [])
+
+        batch = self.tokenizer.pad(
+            flattened_features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+
+        # Un-flatten
+        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
+        # Add back labels
+        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
+        return batch
+
+
+def main():
+    args = parse_args()
+
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator = Accelerator()
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state)
+
+    # Setup logging, we only want one process per machine to log things on the screen.
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+    else:
+        data_files = {}
+        if args.train_file is not None:
+            data_files["train"] = args.train_file
+        if args.validation_file is not None:
+            data_files["validation"] = args.validation_file
+        extension = args.train_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # Trim a number of training examples
+    if args.debug:
+        for split in raw_datasets.keys():
+            raw_datasets[split] = raw_datasets[split].select(range(100))
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    if raw_datasets["train"] is not None:
+        column_names = raw_datasets["train"].column_names
+    else:
+        column_names = raw_datasets["validation"].column_names
+
+    # When using your own dataset or a different dataset from swag, you will probably need to change this.
+    ending_names = [f"ending{i}" for i in range(4)]
+    context_name = "sent1"
+    question_header_name = "sent2"
+    label_column_name = "label" if "label" in column_names else "labels"
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
+    elif args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if args.model_name_or_path:
+        model = AutoModelForMultipleChoice.from_pretrained(
+            args.model_name_or_path,
+            from_tf=bool(".ckpt" in args.model_name_or_path),
+            config=config,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelForMultipleChoice.from_config(config)
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    padding = "max_length" if args.pad_to_max_length else False
+
+    def preprocess_function(examples):
+        first_sentences = [[context] * 4 for context in examples[context_name]]
+        question_headers = examples[question_header_name]
+        second_sentences = [
+            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+        ]
+        labels = examples[label_column_name]
+
+        # Flatten out
+        first_sentences = sum(first_sentences, [])
+        second_sentences = sum(second_sentences, [])
+
+        # Tokenize
+        tokenized_examples = tokenizer(
+            first_sentences,
+            second_sentences,
+            max_length=args.max_length,
+            padding=padding,
+            truncation=True,
+        )
+        # Un-flatten
+        tokenized_inputs = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
+        tokenized_inputs["labels"] = labels
+        return tokenized_inputs
+
+    processed_datasets = raw_datasets.map(
+        preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
+    )
+
+    train_dataset = processed_datasets["train"]
+    eval_dataset = processed_datasets["validation"]
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # DataLoaders creation:
+    if args.pad_to_max_length:
+        # If padding was already done ot max length, we use the default data collator that will just convert everything
+        # to tensors.
+        data_collator = default_data_collator
+    else:
+        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
+        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
+        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+        data_collator = DataCollatorForMultipleChoice(
+            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
+        )
+
+    train_dataloader = DataLoader(
+        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
+    )
+    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+
+    # Use the device given by the `accelerator` object.
+    device = accelerator.device
+    model.to(device)
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
+    # shorter in multiprocess)
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Metrics
+    metric = load_metric("accuracy")
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+        model.eval()
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+            predictions = outputs.logits.argmax(dim=-1)
+            metric.add_batch(
+                predictions=accelerator.gather(predictions),
+                references=accelerator.gather(batch["labels"]),
+            )
+
+        eval_metric = metric.compute()
+        accelerator.print(f"epoch {epoch}: {eval_metric}")
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/multiple-choice/run_tf_multiple_choice.py
+++ b/examples/multiple-choice/run_tf_multiple_choice.py
@@ -116,12 +116,10 @@ def main():
        level=logging.INFO,
    )
    logger.warning(
-        "device: %s, n_replicas: %s, 16-bits training: %s",
-        training_args.device,
-        training_args.n_replicas,
-        training_args.fp16,
+        f"device: {training_args.device}, n_replicas: {training_args.n_replicas}, "
+        f"16-bits training: {training_args.fp16}"
    )
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed
    set_seed(training_args.seed)
@@ -131,7 +129,7 @@ def main():
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
-        raise ValueError("Task not found: %s" % (data_args.task_name))
+        raise ValueError(f"Task not found: {data_args.task_name}")

    # Load pretrained model and tokenizer
    #
@@ -210,8 +208,8 @@ def main():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
-                logger.info("  %s = %s", key, value)
-                writer.write("%s = %s\n" % (key, value))
+                logger.info(f"  {key} = {value}")
+                writer.write(f"{key} = {value}\n")

            results.update(result)

--- a/examples/multiple-choice/utils_multiple_choice.py
+++ b/examples/multiple-choice/utils_multiple_choice.py
@@ -99,13 +99,7 @@ if is_torch_available():
            processor = processors[task]()

            cached_features_file = os.path.join(
-                data_dir,
-                "cached_{}_{}_{}_{}".format(
-                    mode.value,
-                    tokenizer.__class__.__name__,
-                    str(max_seq_length),
-                    task,
-                ),
+                data_dir, f"cached_{mode.value}_{tokenizer.__class__.__name__}_{max_seq_length}_{task}"
            )

            # Make sure only the first process in distributed training processes the dataset,
@@ -125,14 +119,14 @@ if is_torch_available():
                        examples = processor.get_test_examples(data_dir)
                    else:
                        examples = processor.get_train_examples(data_dir)
-                    logger.info("Training examples: %s", len(examples))
+                    logger.info(f"Training examples: {len(examples)}")
                    self.features = convert_examples_to_features(
                        examples,
                        label_list,
                        max_seq_length,
                        tokenizer,
                    )
-                    logger.info("Saving features into cached file %s", cached_features_file)
+                    logger.info(f"Saving features into cached file {cached_features_file}")
                    torch.save(self.features, cached_features_file)

        def __len__(self):
@@ -172,7 +166,7 @@ if is_tf_available():
                examples = processor.get_test_examples(data_dir)
            else:
                examples = processor.get_train_examples(data_dir)
-            logger.info("Training examples: %s", len(examples))
+            logger.info(f"Training examples: {len(examples)}")

            self.features = convert_examples_to_features(
                examples,
@@ -184,7 +178,7 @@ if is_tf_available():
            def gen():
                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
                    if ex_index % 10000 == 0:
-                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+                        logger.info(f"Writing example {ex_index} of {len(examples)}")

                    yield (
                        {
@@ -255,7 +249,7 @@ class RaceProcessor(DataProcessor):

    def get_train_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
        high = os.path.join(data_dir, "train/high")
        middle = os.path.join(data_dir, "train/middle")
        high = self._read_txt(high)
@@ -264,7 +258,7 @@ class RaceProcessor(DataProcessor):

    def get_dev_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
        high = os.path.join(data_dir, "dev/high")
        middle = os.path.join(data_dir, "dev/middle")
        high = self._read_txt(high)
@@ -273,7 +267,7 @@ class RaceProcessor(DataProcessor):

    def get_test_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} test".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
        high = os.path.join(data_dir, "test/high")
        middle = os.path.join(data_dir, "test/middle")
        high = self._read_txt(high)
@@ -298,7 +292,7 @@ class RaceProcessor(DataProcessor):
        """Creates examples for the training and dev sets."""
        examples = []
        for (_, data_raw) in enumerate(lines):
-            race_id = "%s-%s" % (set_type, data_raw["race_id"])
+            race_id = f"{set_type}-{data_raw['race_id']}"
            article = data_raw["article"]
            for i in range(len(data_raw["answers"])):
                truth = str(ord(data_raw["answers"][i]) - ord("A"))
@@ -322,17 +316,17 @@ class SynonymProcessor(DataProcessor):

    def get_train_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")

        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")

@@ -368,17 +362,17 @@ class SwagProcessor(DataProcessor):

    def get_train_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code"
            "setting!"
@@ -419,16 +413,16 @@ class ArcProcessor(DataProcessor):

    def get_train_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} train".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} train")
        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
-        logger.info("LOOKING AT {} dev".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} dev")
        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")

    def get_test_examples(self, data_dir):
-        logger.info("LOOKING AT {} test".format(data_dir))
+        logger.info(f"LOOKING AT {data_dir} test")
        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")

    def get_labels(self):
@@ -450,7 +444,7 @@ class ArcProcessor(DataProcessor):
            elif truth in "1234":
                return int(truth) - 1
            else:
-                logger.info("truth ERROR! %s", str(truth))
+                logger.info(f"truth ERROR! {truth}")
                return None

        examples = []
@@ -496,11 +490,11 @@ class ArcProcessor(DataProcessor):
        if type == "train":
            assert len(examples) > 1
            assert examples[0].label is not None
-        logger.info("len examples: %s}", str(len(examples)))
-        logger.info("Three choices: %s", str(three_choice))
-        logger.info("Five choices: %s", str(five_choice))
-        logger.info("Other choices: %s", str(other_choices))
-        logger.info("four choices: %s", str(four_choice))
+        logger.info(f"len examples: {len(examples)}")
+        logger.info(f"Three choices: {three_choice}")
+        logger.info(f"Five choices: {five_choice}")
+        logger.info(f"Other choices: {other_choices}")
+        logger.info(f"four choices: {four_choice}")

        return examples

@@ -520,7 +514,7 @@ def convert_examples_to_features(
    features = []
    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+            logger.info(f"Writing example {ex_index} of {len(examples)}")
        choices_inputs = []
        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
            text_a = context
@@ -570,7 +564,7 @@ def convert_examples_to_features(

    for f in features[:2]:
        logger.info("*** Example ***")
-        logger.info("feature: %s" % f)
+        logger.info(f"feature: {f}")

    return features

--- a/examples/question-answering/README.md
+++ b/examples/question-answering/README.md
@@ -24,6 +24,11 @@ uses special features of those tokenizers. You can check if your favorite model
 of the script.

 The old version of this script can be found [here](https://github.com/huggingface/transformers/tree/master/examples/legacy/question-answering).
+
+`run_qa.py` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture has a `ForQuestionAnswering` version in the library) on the SQUAD dataset or another question-answering dataset of the `datasets` library or your own csv/jsonlines files as long as they are structured the same way as SQUAD. You might need to tweak the data processing inside the script if your data is structured differently.
+
+Note that if your dataset contains samples with no possible answers (like SQUAD version 2), you need to pass along the flag `--version_2_with_negative`.
+
 #### Fine-tuning BERT on SQuAD1.0

 This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
--- a/examples/question-answering/run_qa.py
+++ b/examples/question-answering/run_qa.py
@@ -46,7 +46,7 @@ from utils_qa import postprocess_qa_predictions


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -100,6 +100,10 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input test data file to run predictions on (a csv or a json file)."},
+    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
@@ -136,6 +140,13 @@ class DataTrainingArguments:
            "value if set."
        },
    )
+    max_test_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "value if set."
+        },
+    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
    )
@@ -164,8 +175,13 @@ class DataTrainingArguments:
    )

    def __post_init__(self):
-        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-            raise ValueError("Need either a dataset name or a training/validation file.")
+        if (
+            self.dataset_name is None
+            and self.train_file is None
+            and self.validation_file is None
+            and self.test_file is None
+        ):
+            raise ValueError("Need either a dataset name or a training/validation/test file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
@@ -173,6 +189,9 @@ class DataTrainingArguments:
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+            if self.test_file is not None:
+                extension = self.test_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."


 def main():
@@ -221,7 +240,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -247,7 +266,9 @@ def main():
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
-
+        if data_args.test_file is not None:
+            data_files["test"] = data_args.test_file
+            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -291,8 +312,10 @@ def main():
    # Preprocessing is slighlty different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
-    else:
+    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
+    else:
+        column_names = datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -444,12 +467,12 @@ def main():
    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_examples = datasets["validation"]
        if data_args.max_val_samples is not None:
            # We will select sample from whole data
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+            eval_examples = eval_examples.select(range(data_args.max_val_samples))
        # Validation Feature Creation
-        eval_dataset = eval_dataset.map(
+        eval_dataset = eval_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
@@ -460,6 +483,25 @@ def main():
            # During Feature creation dataset samples might increase, we will select required samples again
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

+    if training_args.do_predict:
+        if "test" not in datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        test_examples = datasets["test"]
+        if data_args.max_test_samples is not None:
+            # We will select sample from whole data
+            test_examples = test_examples.select(range(data_args.max_test_samples))
+        # Test Feature Creation
+        test_dataset = test_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_test_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+
    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
@@ -470,7 +512,7 @@ def main():
    )

    # Post-processing:
-    def post_processing_function(examples, features, predictions):
+    def post_processing_function(examples, features, predictions, stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions = postprocess_qa_predictions(
            examples=examples,
@@ -482,6 +524,7 @@ def main():
            null_score_diff_threshold=data_args.null_score_diff_threshold,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
+            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
@@ -490,7 +533,8 @@ def main():
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
-        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]]
+
+        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
@@ -504,7 +548,7 @@ def main():
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
-        eval_examples=datasets["validation"] if training_args.do_eval else None,
+        eval_examples=eval_examples if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
@@ -543,6 +587,18 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

+    # Prediction
+    if training_args.do_predict:
+        logger.info("*** Predict ***")
+        results = trainer.predict(test_dataset, test_examples)
+        metrics = results.metrics
+
+        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
+        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+
+        trainer.log_metrics("test", metrics)
+        trainer.save_metrics("test", metrics)
+

 def _mp_fn(index):
    # For xla_spawn (TPUs)
--- a/examples/question-answering/run_qa_beam_search.py
+++ b/examples/question-answering/run_qa_beam_search.py
@@ -45,7 +45,7 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -99,6 +99,10 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input test data file to run predictions on (a csv or a json file)."},
+    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
@@ -135,6 +139,13 @@ class DataTrainingArguments:
            "value if set."
        },
    )
+    max_test_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+            "value if set."
+        },
+    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
    )
@@ -163,8 +174,13 @@ class DataTrainingArguments:
    )

    def __post_init__(self):
-        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
-            raise ValueError("Need either a dataset name or a training/validation file.")
+        if (
+            self.dataset_name is None
+            and self.train_file is None
+            and self.validation_file is None
+            and self.test_file is None
+        ):
+            raise ValueError("Need either a dataset name or a training/validation/test file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
@@ -172,6 +188,9 @@ class DataTrainingArguments:
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+            if self.test_file is not None:
+                extension = self.test_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."


 def main():
@@ -220,7 +239,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -241,9 +260,13 @@ def main():
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
+        if data_args.test_file is not None:
+            data_files["test"] = data_args.test_file
+            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -278,8 +301,10 @@ def main():
    # Preprocessing is slighlty different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
-    else:
+    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
+    else:
+        column_names = datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -478,12 +503,12 @@ def main():
    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_examples = datasets["validation"]
        if data_args.max_val_samples is not None:
            # Selecting Eval Samples from Dataset
-            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+            eval_examples = eval_examples.select(range(data_args.max_val_samples))
        # Create Features from Eval Dataset
-        eval_dataset = eval_dataset.map(
+        eval_dataset = eval_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
@@ -494,6 +519,25 @@ def main():
            # Selecting Samples from Dataset again since Feature Creation might increase samples size
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))

+    if training_args.do_predict:
+        if "test" not in datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        test_examples = datasets["test"]
+        if data_args.max_test_samples is not None:
+            # We will select sample from whole data
+            test_examples = test_examples.select(range(data_args.max_test_samples))
+        # Test Feature Creation
+        test_dataset = test_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_test_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+
    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
@@ -504,7 +548,7 @@ def main():
    )

    # Post-processing:
-    def post_processing_function(examples, features, predictions):
+    def post_processing_function(examples, features, predictions, stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
            examples=examples,
@@ -517,6 +561,7 @@ def main():
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
+            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
@@ -526,7 +571,8 @@ def main():
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
-        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]]
+
+        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
@@ -540,7 +586,7 @@ def main():
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
-        eval_examples=datasets["validation"] if training_args.do_eval else None,
+        eval_examples=eval_examples if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
@@ -580,6 +626,18 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

+    # Prediction
+    if training_args.do_predict:
+        logger.info("*** Predict ***")
+        results = trainer.predict(test_dataset, test_examples)
+        metrics = results.metrics
+
+        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
+        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+
+        trainer.log_metrics("test", metrics)
+        trainer.save_metrics("test", metrics)
+

 def _mp_fn(index):
    # For xla_spawn (TPUs)
--- a/examples/question-answering/run_tf_squad.py
+++ b/examples/question-answering/run_tf_squad.py
@@ -148,12 +148,10 @@ def main():
        level=logging.INFO,
    )
    logger.info(
-        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
-        training_args.n_replicas,
-        bool(training_args.n_replicas > 1),
-        training_args.fp16,
+        f"n_replicas: {training_args.n_replicas}, distributed training: {bool(training_args.n_replicas > 1)}, "
+        f"16-bits training: {training_args.fp16}"
    )
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
--- a/examples/question-answering/trainer_qa.py
+++ b/examples/question-answering/trainer_qa.py
@@ -98,7 +98,7 @@ class QuestionAnsweringTrainer(Trainer):
        if isinstance(test_dataset, datasets.Dataset):
            test_dataset.set_format(type=test_dataset.format["type"], columns=list(test_dataset.features.keys()))

-        eval_preds = self.post_process_function(test_examples, test_dataset, output.predictions)
+        eval_preds = self.post_process_function(test_examples, test_dataset, output.predictions, "test")
        metrics = self.compute_metrics(eval_preds)

        return PredictionOutput(predictions=eval_preds.predictions, label_ids=eval_preds.label_ids, metrics=metrics)
--- a/examples/question-answering/utils_qa.py
+++ b/examples/question-answering/utils_qa.py
@@ -215,14 +215,14 @@ def postprocess_qa_predictions(
        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."

        prediction_file = os.path.join(
-            output_dir, "predictions.json" if prefix is None else f"predictions_{prefix}".json
+            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
        )
        nbest_file = os.path.join(
-            output_dir, "nbest_predictions.json" if prefix is None else f"nbest_predictions_{prefix}".json
+            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
-                output_dir, "null_odds.json" if prefix is None else f"null_odds_{prefix}".json
+                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

        logger.info(f"Saving predictions to {prediction_file}.")
@@ -403,14 +403,14 @@ def postprocess_qa_predictions_with_beam_search(
        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."

        prediction_file = os.path.join(
-            output_dir, "predictions.json" if prefix is None else f"predictions_{prefix}".json
+            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
        )
        nbest_file = os.path.join(
-            output_dir, "nbest_predictions.json" if prefix is None else f"nbest_predictions_{prefix}".json
+            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
        )
        if version_2_with_negative:
            null_odds_file = os.path.join(
-                output_dir, "null_odds.json" if prefix is None else f"null_odds_{prefix}".json
+                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

        print(f"Saving predictions to {prediction_file}.")
--- a/examples/research_projects/lxmert/requirements.txt
+++ b/examples/research_projects/lxmert/requirements.txt
@@ -28,7 +28,7 @@ ipython
 ipython-genutils==0.2.0
 ipywidgets==7.5.1
 jedi==0.17.2
-Jinja2==2.11.2
+Jinja2>=2.11.3
 joblib==0.16.0
 jsonschema==3.2.0
 jupyter==1.0.0
@@ -56,7 +56,7 @@ parso==0.7.1
 pep517==0.8.2
 pexpect==4.8.0
 pickleshare==0.7.5
-Pillow==7.2.0
+Pillow>=8.1.1
 progress==1.5
 prometheus-client==0.8.0
 prompt-toolkit==3.0.7
@@ -64,13 +64,13 @@ ptyprocess==0.6.0
 pyaml==20.4.0
 pyarrow==1.0.1
 pycparser==2.20
-Pygments==2.6.1
+Pygments>=2.7.4
 pyparsing==2.4.6
 pyrsistent==0.16.0
 python-dateutil==2.8.1
 pytoml==0.1.21
 pytz==2020.1
-PyYAML==5.3.1
+PyYAML>=5.4
 pyzmq==19.0.2
 qtconsole==4.7.7
 QtPy==1.9.0
--- a/examples/research_projects/rag/distributed_pytorch_retriever.py
+++ b/examples/research_projects/rag/distributed_pytorch_retriever.py
@@ -22,10 +22,10 @@ class RagPyTorchDistributedRetriever(RagRetriever):
    Args:
        config (:class:`~transformers.RagConfig`):
            The configuration of the RAG model this Retriever is used with. Contains parameters indicating which ``Index`` to build.
-        question_encoder_tokenizer (:class:`~transformers.PretrainedTokenizer`):
+        question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
            The tokenizer that was used to tokenize the question.
            It is used to decode the question and then use the generator_tokenizer.
-        generator_tokenizer (:class:`~transformers.PretrainedTokenizer`):
+        generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
            The tokenizer used for the generator part of the RagModel.
        index (:class:`~transformers.models.rag.retrieval_rag.Index`, optional, defaults to the one defined by the configuration):
            If specified, use this index instead of the one built using the configuration
--- a/examples/research_projects/rag/distributed_ray_retriever.py
+++ b/examples/research_projects/rag/distributed_ray_retriever.py
@@ -50,10 +50,10 @@ class RagRayDistributedRetriever(RagRetriever):
    Args:
        config (:class:`~transformers.RagConfig`):
            The configuration of the RAG model this Retriever is used with. Contains parameters indicating which ``Index`` to build.
-        question_encoder_tokenizer (:class:`~transformers.PretrainedTokenizer`):
+        question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
            The tokenizer that was used to tokenize the question.
            It is used to decode the question and then use the generator_tokenizer.
-        generator_tokenizer (:class:`~transformers.PretrainedTokenizer`):
+        generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
            The tokenizer used for the generator part of the RagModel.
        retrieval_workers (:obj:`List[ray.ActorClass(RayRetriever)]`): A list of already initialized `RayRetriever` actors.
            These actor classes run on remote processes and are responsible for performing the index lookup.
--- a/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md
+++ b/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md
@@ -0,0 +1,516 @@
+# Fine-Tuning week of XLSR-Wav2Vec2 on 60 languages 🌍
+
+Welcome to the fine-tuning week! The goal of this week is to have state-of-the-art automatic speech recognition (ASR) models in as many languages as possible. The fine-tuning week ends on Friday, the 26th March at midnight PST time.
+
+Participants are encouraged to fine-tune the pretrained [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) checkpoint on one or more of the 60 languages of [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets).
+Furthermore, it is very much appreciated if participants fine-tune XLSR-Wav2Vec2 on a language that is not included in the Common Voice dataset.
+
+All fine-tuned models uploaded until Friday, the 26th March midnight PST, will be taken into account for competition, and the best model per language will be awarded a prize if the best model performs reasonably well. 
+The testing data to evaluate the models will be the official [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets) *`test data`* of version 6.1. Again, participants are very much encouraged to fine-tune XLSR-Wav2Vec2 on languages that are not found in the Common Voice dataset since those languages are even more likely to be underrepresented in the speech community. 
+Each model fine-tuned on a language not found in Common Voice, will be evaluated by the Hugging Face team after Friday, the 26th March at midnight PST, and if the model performs reasonably well, the model receives a prize as well. 
+For more information on which data can be used for training, how the models are evaluated exactly, and what type of data preprocessing can be used, please see ["Rules of training and evaluation"](#rules-of-training-and-evaluation).
+
+**Please keep in mind:**
+The spirit of the fine-tuning week is to provide state-of-the-art speech recognition in as many languages as possible to the community! 
+So while we encourage healthy competition between people/groups of the same language so that better results are obtained, it is extremely important that we help each other and share our insights with the whole team/community. 
+What matters in the end is what has been achieved by the team as a whole during the fine-tuning week. 
+That being said, we strongly encourage people to share tips & tricks on the forum or Slack, help each other when team members encounter bugs, and work in groups. 
+To make it easier to share and help, forum threads have been created under the name {language} ASR: Fine-Tuning Wav2Vec2, e.g. here. 
+It is very much possible that prizes will be given to groups of people instead of individuals. Also, don't hesitate to ask questions, propose improvements to the organization, to the material given to participants, etc...🤗
+
+## Table of Contents
+
+- [Organization of the fine tuning week](#organization-of-the-fine-tuning-week)
+- [How to fine tune XLSR Wav2Vec2](#how-to-fine-tune-xlsr-wav2vec2)
+	- [Google colab setup](#google-colab-setup)
+	- [Local machine](#local-machine)
+- [How to upload my trained checkpoint](#how-to-upload-my-trained-checkpoint)
+	- [How to create the README](#how-to-create-the-readme)
+- [How to evaluate my trained checkpoint](#how-to-evaluate-my-trained-checkpoint)
+- [Rules of training and evaluation](#rules-of-training-and-evaluation)
+- [Tips and tricks](#tips-and-tricks)
+	- [How to combine multiple datasets into one](#how-to-combine-multiple-datasets-into-one)
+	- [How to effectively preprocess the data](#how-to-effectively-preprocess-the-data)
+	- [How to efficiently load datasets with limited ram and hard drive space](#how-to-do-efficiently-load-datasets-with-limited-ram-and-hard-drive-space)
+	- [How to do hyperparameter tuning](#how-to-do-hyperparameter-tuning)
+	- [How to preprocess and evaluate character based languages](#how-to-preprocess-and-evaluate-character-based-languages)
+- [Further reading material](#further-reading-material)
+- [FAQ](#faq)
+
+## Organization of the fine tuning week
+
+The week officially starts on 22.03.2021 and ends on 29.03.2021, but you are more than welcome to start fine-tuning models before the start date. 
+General questions you might have, general problems you encounter, and general tips can be shared directly on the Slack channel (see [this post](https://discuss.huggingface.co/t/open-to-the-community-xlsr-wav2vec2-fine-tuning-week-for-low-resource-languages/4467) on how to be added to Slack). 
+More language-specific questions or specific bugs should be posted on the [forum](https://discuss.huggingface.co/) (feel free to use already existing language-specific threads, *e.g.* [this one](https://discuss.huggingface.co/t/arabic-asr-fine-tuning-wav2vec2/4608) or open a new one if there is no thread for your language yet) or directly on [github](https://github.com/huggingface/transformers) if you think some code or document needs correction/improvement.
+Starting on Monday, the 22.03.2021, the Hugging Face team will try to provide an overview of currently trained models along with their evaluation results.
+All the necessary information on:
+
+- How to fine-tune the XLSR model
+- How to upload the model
+- How to share your evaluation results & training/eval script
+- What are the training/evaluation rules
+
+can be found in the sections below. If something is still unclear, feel free to drop a message in the Slack channel.
+
+## How to fine tune XLSR Wav2Vec2
+
+This chapter gives an in-detail explanation of how to fine-tune [Facebook's multi-lingual Wav2vec2](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on any language of the [Common Voice dataset](https://commonvoice.mozilla.org/en/datasets).
+
+Two possible setups can be used to fine-tune Wav2Vec2. The easiest setup is to simply use [google colab](https://colab.research.google.com/). It is possible to train the full model in a *free* google colab, but it is recommended to use google colab pro since it is more stable.
+
+The other option is to run a script locally. While this can be more difficult to set up, it also means that you have more control over the training run and probably access to better GPUs than you would have in a google colab. 
+For small datasets, it is usually totally sufficient to train your model
+in a google colab. For larger and thus more memory-intensive datasets, it is probably
+better to fine-tune the model locally.
+
+For each option, we explain in detail how to fine-tune XLSR-Wav2Vec2 in the following.
+
+### Google colab setup
+
+**Note**: Instead of reading the following section, you can simply watch [this](https://www.youtube.com/watch?v=UynYn2C3tI0&ab_channel=PatrickvonPlaten) video, where Patrick explains how to adapt the google colab for your specific language.
+
+**1.**: If you plan on training XLSR-Wav2Vec2 in a google colab, you should first make sure to have a valid gmail account. You can sign up for a gmail account [here](https://accounts.google.com/signup/v2/webcreateaccount?hl=en&flowName=GlifWebSignIn&flowEntry=SignUp). 
+Having successfully signed up for gmail, you can now sign in to your account to make sure you are logged in when opening new tabs in your browser.
+
+**2.**: Next, head over to the official [Fine-Tune XLSR-Wav2Vec2 with 🤗 Transformers](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb) google colab. The first thing you should do is to make a copy of it - click `->File->Save a copy in Drive`. This should save a copy of the google colab in your google drive. 
+
+**3.**: Now it is highly recommended to carefully read the google colab without running the cells yet. 
+You should get an understanding of how the model is trained and what you will have to change when training the model in a different language. 
+Having done so, you can again head over to [Common Voice](https://commonvoice.mozilla.org/en/datasets) and pick a language you want to fine-tune [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on. Make sure you remember the language code (For each language, you can find it under the field "*Version*". It corresponds to **all characters before the first underscore**. *E.g.* for Greek it is *el*, while for Irish it is *ga-IE*).
+
+**4.**: Now you should replace the language code used for the demo of this colab, being *tr* for Turkish with the language code corresponding to the language you just chose in the **second** cell of the google colab. This will load the correct data for your language.
+
+**5.**: It is time to start running the google colab! Make sure that you have selected "GPU" as your runtime environment and you can start running the cells one-by-one. Make sure you attentively read the text between the cells to understand what is happening and to eventually correct the cells to improve the fine-tuning script for your language. Things you might want to improve/change:
+ 
+ - Data loading. It is very much recommended to use more than just the official training data of the Common Voice dataset. If you find more data on the internet, feel free to use it! Check out the section ["How to combine multiple datasets into one"](#how-to-combine-multiple-datasets-into-one)
+
+- Data Processing. You should adapt the data processing to your specific language. In data processing, you should make the data more uniform so that it will be easier for the model to learn how to classify speech in your data. Here it can be really helpful to be proficient in the language to know what can be done to simplify the language without changing the meaning. 
+Data processing methods include, but are not limited to:
+	- Normalizing your data. Make sure all characters are lower-cased.
+	- Remove typographical symbols and punctuation marks. See a list [here](https://en.wikipedia.org/wiki/List_of_typographical_symbols_and_punctuation_marks). Be careful to not remove punctuation marks that can change the meaning of the sentence. *E.g.* you should not remove the single quotation mark `'` in English, as it would change the words `"it's"` to `"its"` which is a different word and has thus a different meaning. For more tips on data processing see ["How to effectively preprocess the data"](#how-to-effectively-preprocess-the-data)
+
+- Hyperparameter Tuning. Depending on the size of the data you should probably change the hyperparameters of the google colab. You can change any parameter you like. For more tips and tricks see ["How to do hyperparameter tuning"](#how-to-do-hyperparameter-tuning)
+
+When running the google colab make sure that you uncomment the cell corresponding to mounting your google drive to the colab. This cell looks as follows:
+
+```python
+# from google.colab import drive
+# drive.mount('/content/gdrive/')
+``` 
+
+Uncomment it, run it, and follow the instructions to mount your google drive. This way you can be sure that the model parameters and created tokenizer & feature extractor files are saved in **your** google drive.
+
+Also, make sure that you uncomment the cells corresponding to saving the preprocessing files and trained model weights to your drive. Otherwise, you might lose a trained model if your google colab crashes. You should change the name of your model from `wav2vec2-large-xlsr-turkish-demo` to `wav2vec2-large-xlsr-{your_favorite_name}`.
+
+Those cells correspond to:
+
+```python
+# processor.save_pretrained("/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo")
+```
+
+and the line:
+
+```python
+  output_dir="/content/gdrive/MyDrive/wav2vec2-large-xlsr-turkish-demo",
+```
+
+further below (which should already be uncommented).
+
+Having finished the training you should find the following files/folders under the folder `wav2vec2-large-xlsr-{your_favorite_name}` in your google drive:
+
+- `preprocessor_config.json` - the parameters of the feature extractor
+- `special_tokens_map.json` - the special token map of the tokenizer
+- `tokenizer_config.json` - the parameters of the tokenizer
+- `vocab.json` - the vocabulary of the tokenizer
+- `checkpoint-{...}/` - the checkpoints saved during training. Each checkpoint should contain the files: `config.json`, `optimizer.pt`, `pytorch_model.bin`, `scheduler.pt`, `training_args.bin`. The files `config.json` and `pytorch_model.bin` define your model.
+
+If you are happy with your training results it is time to upload your model! 
+Download the following files to your local computer: **`preprocessor_config.json`, `special_tokens_map.json`, `tokenizer_config.json`, `vocab.json`, `config.json`, `pytorch_model.bin`**. Those files fully define a XLSR-Wav2Vec2 model checkpoint.
+
+Awesome you have successfully trained a XLSR-Wav2Vec2 model 😎. Now you can jump to the section ["How to upload my trained checkpoint"](#how-to-upload-my-trained-checkpoint)
+
+### Local machine
+
+We have provided the `run_common_voice.py` script to run fine-tuning on a local machine. The script is similar to the colab but allows you to launch training using the command line, save and continue training from previous checkpoints, and launch training on multiple GPUs.
+For bigger datasets, we recommend to train Wav2Vec2 locally instead of in a google colab.
+
+1. To begin with, we should clone transformers locally and install all the required packages.
+
+First, you need to clone the `transformers` repo with:
+
+```
+$ git clone https://github.com/huggingface/transformers.git
+```
+
+Second, head over to the `examples/research_projects/wav2vec2` directory, where the `run_common_voice.py` script is located.
+
+```
+$ cd transformers/examples/research_projects/wav2vec2
+```
+
+Third, install the required packages. The
+packages are listed in the `requirements.txt` file and can be installed with
+
+```
+$ pip install -r requirements.txt
+```
+
+	**Note**: Installing the latest version of `torchaudio` will also upgrade `torch` to its latest stable version. If you are using a specific version of `torch` then make sure
+	to use the correct `torchaudio` version compatible with your version of `torch`. By default the `requirements.txt` will install the latest version of `torchaudio`.
+
+2. Next, take a look at the `run_common_voice.py` script to get an understanding of how it works. In short the script does the following:
+
+	- Load the given common voice dataset
+	- Create vocab for the language
+	- Load the model with given hyperparameters
+	- Pre-process the dataset to input into the model
+	- Run training
+	- Run evaluation
+
+3. The following examples show how you can launch fine-tuning for the common voice dataset. 
+Here we will run the script on the *Turkish* Common Voice dataset for demonstration purposes.
+	
+	**To launch fine-tuning on a single GPU** (use the `--dataset_config_name` argument to specify the language code):
+	
+	```bash
+	python run_common_voice.py \
+		--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
+		--dataset_config_name="tr" \
+		--output_dir=./wav2vec2-large-xlsr-turkish-demo \
+		--overwrite_output_dir \
+		--num_train_epochs="5" \
+		--per_device_train_batch_size="16" \
+		--learning_rate="3e-4" \
+		--warmup_steps="500" \
+		--evaluation_strategy="steps" \
+		--save_steps="400" \
+		--eval_steps="400" \
+		--logging_steps="400" \
+		--save_total_limit="3" \
+		--freeze_feature_extractor \
+		--feat_proj_dropout="0.0" \
+		--layerdrop="0.1" \
+		--gradient_checkpointing \
+		--fp16 \
+		--group_by_length \
+		--do_train --do_eval
+	```
+
+	**To launch fine-tuning on multiple GPUs** (use the `--dataset_config_name` argument to specify the language code):
+	
+	```bash
+	python -m torch.distributed.launch \
+		--nproc_per_node 4 run_common_voice.py \
+		--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
+		--dataset_config_name="tr" \
+		--output_dir=./wav2vec2-large-xlsr-turkish-demo \
+		--overwrite_output_dir \
+		--num_train_epochs="5" \
+		--per_device_train_batch_size="16" \
+		--learning_rate="3e-4" \
+		--warmup_steps="500" \
+		--evaluation_strategy="steps" \
+		--save_steps="400" \
+		--eval_steps="400" \
+		--logging_steps="400" \
+		--save_total_limit="3" \
+		--freeze_feature_extractor \
+		--feat_proj_dropout="0.0" \
+		--layerdrop="0.1" \
+		--gradient_checkpointing \
+		--fp16 \
+		--group_by_length \
+		--do_train --do_eval
+	```
+
+	The above command will launch the training on 4 GPUs. Use the `--nproc_per_node` option to specify the number of GPUs.
+
+	Once the training is finished, the model and checkpoints will be saved under the directory specified by the `--output_dir` argument.
+
+4. The script also allows you to resume training from the last saved checkpoint. To resume training from last saved checkpoint remove the `--overwrite_output_dir` option and run the same command again.  And to continue training from a specific checkpoint, keep the `--overwrite_output_dir`
+option and pass the path of the checkpoint as `--model_name_or_path`.
+
+As the script is based on the `Trainer` API, refer to the [Trainer docs](https://huggingface.co/transformers/main_classes/trainer.html) for more information about ``Trainer`` and ``TrainingArguments``.
+
+[OVH cloud](https://www.ovh.com/world/) has generously offered free compute for this sprint. Please refer to [this video](https://www.youtube.com/watch?v=2hlkWAESMk8&ab_channel=Databuzzword) to get started with OVH. 
+
+
+## How to upload my trained checkpoint
+
+To upload your trained checkpoint, you have to create a new model repository on the 🤗 model hub, from this page: https://huggingface.co/new
+
+> You can also follow the more in-depth instructions [here](https://huggingface.co/transformers/model_sharing.html) if needed.
+
+Having created your model repository on the hub, you should clone it locally:
+
+```bash
+git lfs install
+
+git clone https://huggingface.co/username/your-model-name
+```
+
+Then add the following files that fully define a XLSR-Wav2Vec2 checkpoint into the repository. You should have added the following files.
+
+- `preprocessor_config.json`
+- `special_tokens_map.json`
+- `tokenizer_config.json`
+- `vocab.json`
+- `config.json`
+- `pytorch_model.bin`
+
+Having added the above files, you should run the following to push files to your model repository.  
+```
+git add . && git commit -m "Add model files" && git push
+```
+
+The next **very important** step is to create the model card. For people to use your fine-tuned 
+model it is important to understand: 
+
+- What kind of model is it?
+- What is your model useful for?
+- What data was your model trained on?
+- How well does your model perform?
+
+All these questions should be answered in a model card which is the first thing people see when 
+visiting your model on the hub under `https://huggingface.co/{your_username}/{your_modelname}`.
+
+**Note**:
+It is extremely important that you add this model card or else we cannot find your model and thus cannot take the model into 
+account for the final evaluation.
+
+### How to create the readme
+
+The model card is written in markdown (`.md`) and should be added by simply clicking on the "Add model card" button which is found on the top right corner. 
+You are encouraged to copy-paste the following template into your model card. 
+
+**Make sure that** instead of copying the output of the markdown file you copy the **raw** version of the following part. 
+
+To get the raw version of this file, simply click on the "`raw`" button on the top right corner of this file next to "`blame`" and copy everything below the marker.
+Make sure that you read and consequently remove all #TODO: statements from the model card. 
+
+<======================Copy **raw** version from here=========================
+---
+language: {lang_id} #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+datasets:
+- common_voice #TODO: remove if you did not use the common voice dataset
+- TODO: add more datasets if you have used additional datasets. Make sure to use the exact same 
+dataset name as the one found [here](https://huggingface.co/datasets). If the dataset can not be found in the official datasets, just give it a new name
+metrics:
+- wer
+tags:
+- audio
+- automatic-speech-recognition
+- speech
+- xlsr-fine-tuning-week
+license: apache-2.0
+model-index:
+- name: {human_readable_name} #TODO: replace {human_readable_name} with a name of your model as it should appear on the leaderboard. It could be something like `Elgeish XLSR Wav2Vec2 Large 53`
+  results:
+  - task: 
+      name: Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Common Voice {lang_id} #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+      type: common_voice
+      args: {lang_id} #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+    metrics:
+       - name: Test WER
+         type: wer
+         value: {wer_result_on_test} #TODO (IMPORTANT): replace {wer_result_on_test} with the WER error rate you achieved on the common_voice test set. It should be in the format XX.XX (don't add the % sign here). **Please** remember to fill out this value after you evaluated your model, so that your model appears on the leaderboard. If you fill out this model card before evaluating your model, please remember to edit the model card afterward to fill in your value
+---
+
+# Wav2Vec2-Large-XLSR-53-{language} #TODO: replace language with your {language}, *e.g.* French
+
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on {language} using the [Common Voice](https://huggingface.co/datasets/common_voice), ... and ... dataset{s}. #TODO: replace {language} with your language, *e.g.* French and eventually add more datasets that were used and eventually remove common voice if model was not trained on common voice
+When using this model, make sure that your speech input is sampled at 16kHz.
+
+## Usage
+
+The model can be used directly (without a language model) as follows:
+
+```python
+import torch
+import torchaudio
+from datasets import load_dataset
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+
+test_dataset = load_dataset("common_voice", "{lang_id}", split="test[:2%]") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+
+processor = Wav2Vec2Processor.from_pretrained("{model_id}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
+model = Wav2Vec2ForCTC.from_pretrained("{model_id}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
+
+resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+# Preprocessing the datasets.
+# We need to read the audio files as arrays
+def speech_file_to_array_fn(batch):
+	speech_array, sampling_rate = torchaudio.load(batch["path"])
+	batch["speech"] = resampler(speech_array).squeeze().numpy()
+	return batch
+
+test_dataset = test_dataset.map(speech_file_to_array_fn)
+inputs = processor(test_dataset[:2]["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+with torch.no_grad():
+	logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
+
+predicted_ids = torch.argmax(logits, dim=-1)
+
+print("Prediction:", processor.batch_decode(predicted_ids))
+print("Reference:", test_dataset[:2]["sentence"])
+```
+
+
+## Evaluation
+
+The model can be evaluated as follows on the {language} test data of Common Voice.  # TODO: replace {language} with your language, *e.g.* French
+
+
+```python
+import torch
+import torchaudio
+from datasets import load_dataset, load_metric
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import re
+
+test_dataset = load_dataset("common_voice", "{lang_id}", split="test") #TODO: replace {lang_id} in your language code here. Make sure the code is one of the *ISO codes* of [this](https://huggingface.co/languages) site.
+wer = load_metric("wer")
+
+processor = Wav2Vec2Processor.from_pretrained("{model_id}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
+model = Wav2Vec2ForCTC.from_pretrained("{model_id}") #TODO: replace {model_id} with your model id. The model id consists of {your_username}/{your_modelname}, *e.g.* `elgeish/wav2vec2-large-xlsr-53-arabic`
+model.to("cuda")
+
+chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“]'  # TODO: adapt this list to include all special characters you removed from the data
+resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+# Preprocessing the datasets.
+# We need to read the audio files as arrays
+def speech_file_to_array_fn(batch):
+	batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
+	speech_array, sampling_rate = torchaudio.load(batch["path"])
+	batch["speech"] = resampler(speech_array).squeeze().numpy()
+	return batch
+
+test_dataset = test_dataset.map(speech_file_to_array_fn)
+
+# Running inference on the preprocessed datasets.
+# We feed batches of speech arrays to the model and decode the predicted ids to strings
+def evaluate(batch):
+	inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+
+	with torch.no_grad():
+		logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits
+
+	pred_ids = torch.argmax(logits, dim=-1)
+	batch["pred_strings"] = processor.batch_decode(pred_ids)
+	return batch
+
+result = test_dataset.map(evaluate, batched=True, batch_size=8)
+
+print("WER: {:.2f}".format(100 * wer.compute(predictions=result["pred_strings"], references=result["sentence"])))
+```
+
+**Test Result**: XX.XX %  # TODO: write output of print here. IMPORTANT: Please remember to also replace {wer_result_on_test} in the YAML tags at the top with this value.
+
+
+## Training
+
+The Common Voice `train`, `validation`, and ... datasets were used for training as well as ... and ...  # TODO: adapt to state all the datasets that were used for training.
+
+The script used for training can be found [here](...) # TODO: fill in a link to your training script here. If you trained your model in a colab, simply fill in the link here. If you trained the model locally, it would be great if you could upload the training script on github and paste the link here.
+
+=======================To here===============================>
+
+Your model is then available under *huggingface.co/{your_username}/{your_chosen_xlsr-large_model_name}* for everybody to use 🎉.
+
+## How to evaluate my trained checkpoint
+
+Having uploaded your model, you should now evaluate your model in a final step. This should be as simple as 
+copying the evaluation code of your model card into a python script and running it. Make sure to note 
+the final result on the model card **both** under the YAML tags at the very top **and** below your evaluation code under "Test Results".
+
+## Rules of training and evaluation
+
+In this section, we will quickly go over what data is allowed to be used as training 
+data, what kind of data preprocessing is allowed to be used, and how the model should be evaluated.
+
+To make it very simple regarding the first point: **All data except the official common voice `test` data set can be used as training data**. For models trained in a language that is not included in Common Voice, the author of the model is responsible to 
+leave a reasonable amount of data for evaluation.
+
+Second, the rules regarding the preprocessing are not as straightforward. It is allowed (and recommended) to 
+normalize the data to only have lower-case characters. It is also allowed (and recommended) to remove typographical 
+symbols and punctuation marks. A list of such symbols can *e.g.* be found [here](https://en.wikipedia.org/wiki/List_of_typographical_symbols_and_punctuation_marks) - however here we already must be careful. We should **not** remove a symbol that 
+would change the meaning of the words, *e.g.* in English, we should not remove the single quotation mark `'` since it 
+would change the meaning of the word `"it's"` to `"its"` which would then be incorrect. So the golden rule here is to 
+not remove any characters that could change the meaning of a word into another word. This is not always obvious and should 
+be given some consideration. As another example, it is fine to remove the "Hyphen-minus" sign "`-`" since it doesn't change the 
+meaning of a word to another one. *E.g.* "`fine-tuning`" would be changed to "`finetuning`" which has still the same meaning.
+
+Since those choices are not always obvious, when in doubt feel free to ask on Slack or even better post on the forum, as was 
+done, *e.g.* [here](https://discuss.huggingface.co/t/spanish-asr-fine-tuning-wav2vec2/4586).
+
+## Tips and tricks
+
+This section summarizes a couple of tips and tricks across various topics. It will continuously be updated during the week.
+
+### How to combine multiple datasets into one
+
+Check out [this](https://discuss.huggingface.co/t/how-to-combine-local-data-files-with-an-official-dataset/4685) post.
+
+### How to effectively preprocess the data
+
+
+### How to do efficiently load datasets with limited ram and hard drive space
+
+Check out [this](https://discuss.huggingface.co/t/german-asr-fine-tuning-wav2vec2/4558/8?u=patrickvonplaten) post.
+
+
+### How to do hyperparameter tuning
+
+
+### How to preprocess and evaluate character based languages
+
+
+## Further reading material
+
+It is recommended that you take some time to read up on how Wav2vec2 works in theory. 
+Getting a better understanding of the theory and the inner mechanisms of the model often helps when fine-tuning the model. 
+
+**However**, if you don't like reading blog posts/papers, don't worry - it is by no means necessary to go through the theory to fine-tune Wav2Vec2 on your language of choice.
+
+If you are interested in learning more about the model though, here are a couple of resources that are important to better understand Wav2Vec2:
+
+- [Facebook's Wav2Vec2 blog post](https://ai.facebook.com/blog/wav2vec-state-of-the-art-speech-recognition-through-self-supervision/)
+- [Official Wav2Vec2 paper](https://arxiv.org/abs/2006.11477)
+- [Official XLSR Wav2vec2 paper](https://arxiv.org/pdf/2006.13979.pdf)
+- [Hugging Face Blog](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2)
+- [How does CTC (Connectionist Temporal Classification) work](https://distill.pub/2017/ctc/)
+
+It helps to have a good understanding of the following points:
+
+- How was XLSR-Wav2Vec2 pretrained? -> Feature vectors were masked and had to be predicted by the model; very similar in spirit to masked language model of BERT.
+
+- What parts of XLSR-Wav2Vec2 are responsible for what? What is the feature extractor part used for? -> extract feature vectors from the 1D raw audio waveform; What is the transformer part doing? -> mapping feature vectors to contextualized feature vectors; ...
+
+- What part of the model needs to be fine-tuned? -> The pretrained model **does not** include a language head to classify the contextualized features to letters. This is randomly initialized when loading the pretrained checkpoint and has to be fine-tuned. Also, note that the authors recommend to **not** further fine-tune the feature extractor.
+
+- What data was used to pretrain XLSR-Wav2Vec2? The checkpoint we will use for further fine-tuning was pretrained on **53** languages. 
+
+- What languages are considered to be similar by XLSR-Wav2Vec2? In the official [XLSR Wav2Vec2 paper](https://arxiv.org/pdf/2006.13979.pdf), the authors show nicely which languages share a common contextualized latent space. It might be useful for you to extend your training data with data of other languages that are considered to be very similar by the model (or you).
+
+
+## FAQ
+
+- Can a participant fine-tune models for more than one language? 
+Yes! A participant can fine-tune models in as many languages as she/he likes
+- Can a participant use extra data (apart from the common voice data)?
+Yes! All data except the official common voice `test data` can be used for training.
+If a participant wants to train a model on a language that is not part of Common Voice (which 
+is very much encouraged!), the participant should make sure that some test data is held out to 
+make sure the model is not overfitting.
+- Can we fine-tune for high-resource languages? 
+Yes! We do not really recommend fine-tuning models in English, since there are
+already so many fine-tuned speech recognition models in English. However, it is very much 
+appreciated if participants want to fine-tune models in other "high-resource" languages, such 
+as French, Spanish, or German. For such cases, one probably needs to train locally and 
+might have to apply tricks such as lazy data loading (check the ["How to efficiently load datasets with limited ram and hard drive space"](#how-to-do-efficiently-load-datasets-with-limited-ram-and-hard-drive-space) section for more details).
--- a/examples/research_projects/wav2vec2/README.md
+++ b/examples/research_projects/wav2vec2/README.md
@@ -1,8 +1,129 @@
 ## Fine-tuning Wav2Vec2

-The `run_training.py` script allows one to finetune pretrained Wav2Vec2 models that can be found [here](https://huggingface.co/models?search=facebook/wav2vec2).
+The `run_asr.py` script allows one to fine-tune pretrained Wav2Vec2 models that can be found [here](https://huggingface.co/models?search=facebook/wav2vec2).

 This finetuning script can also be run as a google colab [TODO: here]( ).

-The script is actively maintained by [Patrick von Platen](https://github.com/patrickvonplaten). 
+The script is actively maintained by [Patrick von Platen](https://github.com/patrickvonplaten).
 Feel free to ask a question on the [Forum](https://discuss.huggingface.co/) or post an issue on [GitHub](https://github.com/huggingface/transformers/issues/new/choose) and adding `@patrickvonplaten` as a tag.
+
+### Fine-Tuning with TIMIT
+Let's take a look at the [script](./finetune_base_timit_asr.sh) used to fine-tune [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base)
+with the [TIMIT dataset](https://huggingface.co/datasets/timit_asr):
+
+```bash
+#!/usr/bin/env bash
+python run_asr.py \
+--output_dir="./wav2vec2-base-timit-asr" \
+--num_train_epochs="30" \
+--per_device_train_batch_size="20" \
+--per_device_eval_batch_size="20" \
+--evaluation_strategy="steps" \
+--save_steps="500" \
+--eval_steps="100" \
+--logging_steps="50" \
+--learning_rate="5e-4" \
+--warmup_steps="3000" \
+--model_name_or_path="facebook/wav2vec2-base" \
+--fp16 \
+--dataset_name="timit_asr" \
+--train_split_name="train" \
+--validation_split_name="test" \
+--orthography="timit" \
+--preprocessing_num_workers="$(nproc)" \
+--group_by_length \
+--freeze_feature_extractor \
+--verbose_logging \
+```
+
+The resulting model and inference examples can be found [here](https://huggingface.co/elgeish/wav2vec2-base-timit-asr).
+Some of the arguments above may look unfamiliar; let's break down what's going on:
+
+`--orthography="timit"` applies certain text preprocessing rules, for tokenization and normalization, to clean up the dataset.
+In this case, we use the following instance of `Orthography`:
+
+```python
+Orthography(
+    do_lower_case=True,
+    # break compounds like "quarter-century-old" and replace pauses "--"
+    translation_table=str.maketrans({"-": " "}),
+)
+```
+
+The instance above is used as follows:
+* creates a tokenizer with `do_lower_case=True` (ignores casing for input and lowercases output when decoding)
+* replaces `"-"` with `" "` to break compounds like `"quarter-century-old"` and to clean up suspended hyphens
+* cleans up consecutive whitespaces (replaces them with a single space: `" "`)
+* removes characters not in vocabulary (lacking respective sound units)
+
+`--verbose_logging` logs text preprocessing updates and when evaluating, using the validation split every `eval_steps`,
+logs references and predictions.
+
+### Fine-Tuning with Arabic Speech Corpus
+
+Other datasets, like the [Arabic Speech Corpus dataset](https://huggingface.co/datasets/arabic_speech_corpus),
+require more work! Let's take a look at the [script](./finetune_large_xlsr_53_arabic_speech_corpus.sh)
+used to fine-tune [wav2vec2-large-xlsr-53](https://huggingface.co/elgeish/wav2vec2-large-xlsr-53-arabic):
+
+```bash
+#!/usr/bin/env bash
+python run_asr.py \
+--output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \
+--num_train_epochs="50" \
+--per_device_train_batch_size="1" \
+--per_device_eval_batch_size="1" \
+--gradient_accumulation_steps="8" \
+--evaluation_strategy="steps" \
+--save_steps="500" \
+--eval_steps="100" \
+--logging_steps="50" \
+--learning_rate="5e-4" \
+--warmup_steps="3000" \
+--model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \
+--fp16 \
+--dataset_name="arabic_speech_corpus" \
+--train_split_name="train" \
+--validation_split_name="test" \
+--max_duration_in_seconds="15" \
+--orthography="buckwalter" \
+--preprocessing_num_workers="$(nproc)" \
+--group_by_length \
+--freeze_feature_extractor \
+--target_feature_extractor_sampling_rate \
+--verbose_logging \
+```
+
+First, let's understand how this dataset represents Arabic text; it uses a format called
+[Buckwalter transliteration](https://en.wikipedia.org/wiki/Buckwalter_transliteration).
+We use the [lang-trans](https://github.com/kariminf/lang-trans) package to convert back to Arabic when logging.
+The Buckwalter format only includes ASCII characters, some of which are non-alpha (e.g., `">"` maps to `"أ"`).
+
+`--orthography="buckwalter"` applies certain text preprocessing rules, for tokenization and normalization, to clean up the dataset. In this case, we use the following instance of `Orthography`:
+
+```python
+Orthography(
+    vocab_file=pathlib.Path(__file__).parent.joinpath("vocab/buckwalter.json"),
+    word_delimiter_token="/",  # "|" is Arabic letter alef with madda above
+    words_to_remove={"sil"},  # fixing "sil" in arabic_speech_corpus dataset
+    untransliterator=arabic.buckwalter.untransliterate,
+    translation_table=str.maketrans({
+        "-": " ",  # sometimes used to represent pauses
+        "^": "v",  # fixing "tha" in arabic_speech_corpus dataset
+    }),
+)
+```
+
+The instance above is used as follows:
+* creates a tokenizer with Buckwalter vocabulary and `word_delimiter_token="/"`
+* replaces `"-"` with `" "` to clean up hyphens and fixes the orthography for `"ث"`
+* removes words used as indicators (in this case, `"sil"` is used for silence)
+* cleans up consecutive whitespaces (replaces them with a single space: `" "`)
+* removes characters not in vocabulary (lacking respective sound units)
+
+`--verbose_logging` logs text preprocessing updates and when evaluating, using the validation split every `eval_steps`,
+logs references and predictions. Using the Buckwalter format, text is also logged in Arabic abjad.
+
+`--target_feature_extractor_sampling_rate` resamples audio to target feature extractor's sampling rate (16kHz).
+
+`--max_duration_in_seconds="15"` filters out examples whose audio is longer than the specified limit,
+which helps with capping GPU memory usage.
--- a/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh
+++ b/examples/research_projects/wav2vec2/finetune_base_timit_asr.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Fine-tunes facebook/wav2vec2-base on the timit_asr dataset via run_asr.py
+# (expected in the current directory); output goes to ./wav2vec2-base-timit-asr.
+python run_asr.py \
+--output_dir="./wav2vec2-base-timit-asr" \
+--num_train_epochs="30" \
+--per_device_train_batch_size="20" \
+--per_device_eval_batch_size="20" \
+--evaluation_strategy="steps" \
+--save_steps="500" \
+--eval_steps="100" \
+--logging_steps="50" \
+--learning_rate="5e-4" \
+--warmup_steps="3000" \
+--model_name_or_path="facebook/wav2vec2-base" \
+--fp16 \
+--dataset_name="timit_asr" \
+--train_split_name="train" \
+--validation_split_name="test" \
+--orthography="timit" \
+--preprocessing_num_workers="$(nproc)" \
+--group_by_length \
+--freeze_feature_extractor \
+--verbose_logging
--- a/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh
+++ b/examples/research_projects/wav2vec2/finetune_large_lv60_timit_asr.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Fine-tunes facebook/wav2vec2-large-lv60 on the timit_asr dataset via run_asr.py
+# (expected in the current directory); output goes to ./wav2vec2-large-lv60-timit-asr.
+python run_asr.py \
+--output_dir="./wav2vec2-large-lv60-timit-asr" \
+--num_train_epochs="30" \
+--per_device_train_batch_size="2" \
+--per_device_eval_batch_size="2" \
+--gradient_accumulation_steps="4" \
+--evaluation_strategy="steps" \
+--save_steps="500" \
+--eval_steps="100" \
+--logging_steps="50" \
+--learning_rate="5e-4" \
+--warmup_steps="3000" \
+--model_name_or_path="facebook/wav2vec2-large-lv60" \
+--fp16 \
+--dataset_name="timit_asr" \
+--train_split_name="train" \
+--validation_split_name="test" \
+--orthography="timit" \
+--preprocessing_num_workers="$(nproc)" \
+--group_by_length \
+--freeze_feature_extractor \
+--verbose_logging
--- a/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh
+++ b/examples/research_projects/wav2vec2/finetune_large_xlsr_53_arabic_speech_corpus.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Fine-tunes elgeish/wav2vec2-large-xlsr-53-arabic on the arabic_speech_corpus dataset via run_asr.py
+# (expected in the current directory); output goes to ./wav2vec2-large-xlsr-53-arabic-speech-corpus.
+python run_asr.py \
+--output_dir="./wav2vec2-large-xlsr-53-arabic-speech-corpus" \
+--num_train_epochs="50" \
+--per_device_train_batch_size="1" \
+--per_device_eval_batch_size="1" \
+--gradient_accumulation_steps="8" \
+--evaluation_strategy="steps" \
+--save_steps="500" \
+--eval_steps="100" \
+--logging_steps="50" \
+--learning_rate="5e-4" \
+--warmup_steps="3000" \
+--model_name_or_path="elgeish/wav2vec2-large-xlsr-53-arabic" \
+--fp16 \
+--dataset_name="arabic_speech_corpus" \
+--train_split_name="train" \
+--validation_split_name="test" \
+--max_duration_in_seconds="15" \
+--orthography="buckwalter" \
+--preprocessing_num_workers="$(nproc)" \
+--group_by_length \
+--freeze_feature_extractor \
+--target_feature_extractor_sampling_rate \
+--verbose_logging
--- a/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh
+++ b/examples/research_projects/wav2vec2/finetune_wav2vec2_xlsr_turkish.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Fine-tunes facebook/wav2vec2-large-xlsr-53 with dataset config "tr" via run_common_voice.py
+# (expected in the current directory). Runs both training and evaluation and writes to
+# ./wav2vec2-large-xlsr-turkish-demo, overwriting any existing output directory.
+python run_common_voice.py \
+    --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
+    --dataset_config_name="tr" \
+    --output_dir=./wav2vec2-large-xlsr-turkish-demo \
+    --overwrite_output_dir \
+    --num_train_epochs="5" \
+    --per_device_train_batch_size="16" \
+    --evaluation_strategy="steps" \
+    --learning_rate="3e-4" \
+    --warmup_steps="500" \
+    --fp16 \
+    --freeze_feature_extractor \
+    --save_steps="400" \
+    --eval_steps="400" \
+    --save_total_limit="3" \
+    --logging_steps="400" \
+    --group_by_length \
+    --feat_proj_dropout="0.0" \
+    --layerdrop="0.1" \
+    --gradient_checkpointing \
+    --do_train --do_eval
--- a/examples/research_projects/wav2vec2/requirements.txt
+++ b/examples/research_projects/wav2vec2/requirements.txt
@@ -1,4 +1,7 @@
 transformers
 datasets
-torch >= 1.5.0
-jiwer
+torch>=1.5.0
+torchaudio
+jiwer==2.2.0
+lang-trans==0.6.0
+librosa==0.8.0
--- a/examples/research_projects/wav2vec2/run_asr.py
+++ b/examples/research_projects/wav2vec2/run_asr.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python3
+import logging
+import pathlib
+import re
+import sys
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Union

 import datasets
 import numpy as np
@@ -8,26 +12,32 @@ import torch
 import torch.nn as nn
 from packaging import version

-import soundfile as sf
+import librosa
+from lang_trans import arabic
 from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
+    Wav2Vec2CTCTokenizer,
+    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    is_apex_available,
+    trainer_utils,
 )


 if is_apex_available():
    from apex import amp

-
 if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


+logger = logging.getLogger(__name__)
+
+
@dataclass
 class ModelArguments:
    """
@@ -44,6 +54,27 @@ class ModelArguments:
    freeze_feature_extractor: Optional[bool] = field(
        default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
    )
+    # Forwarded as `gradient_checkpointing` to `Wav2Vec2ForCTC.from_pretrained` in `main()`.
+    gradient_checkpointing: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    verbose_logging: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to log verbose messages or not."},
+    )
+
+
+def configure_logger(model_args: ModelArguments, training_args: TrainingArguments):
+    """
+    Route logging to stdout and choose the level of this module's logger.
+
+    The level is DEBUG when ``model_args.verbose_logging`` is set, INFO when
+    ``trainer_utils.is_main_process(training_args.local_rank)`` is true, and WARNING otherwise.
+    """
+    stdout_handler = logging.StreamHandler(sys.stdout)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[stdout_handler],
+    )
+    if model_args.verbose_logging:
+        level = logging.DEBUG
+    elif trainer_utils.is_main_process(training_args.local_rank):
+        level = logging.INFO
+    else:
+        level = logging.WARNING
+    logger.setLevel(level)


@dataclass
@@ -68,6 +99,34 @@ class DataTrainingArguments:
            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
        },
    )
+    validation_split_name: Optional[str] = field(
+        default="validation",
+        metadata={
+            "help": "The name of the validation data set split to use (via the datasets library). Defaults to 'validation'"
+        },
+    )
+    target_text_column: Optional[str] = field(
+        default="text",
+        metadata={"help": "Column in the dataset that contains label (target text). Defaults to 'text'"},
+    )
+    speech_file_column: Optional[str] = field(
+        default="file",
+        metadata={"help": "Column in the dataset that contains speech file path. Defaults to 'file'"},
+    )
+    target_feature_extractor_sampling_rate: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Resample loaded audio to target feature extractor's sampling rate or not."},
+    )
+    max_duration_in_seconds: Optional[float] = field(
+        default=None,
+        metadata={"help": "Filters out examples longer than specified. Defaults to no filtering."},
+    )
+    orthography: Optional[str] = field(
+        default="librispeech",
+        metadata={
+            "help": "Orthography used for normalization and tokenization: 'librispeech' (default), 'timit', or 'buckwalter'."
+        },
+    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
@@ -77,6 +136,88 @@ class DataTrainingArguments:
    )


+@dataclass
+class Orthography:
+    """
+    Orthography scheme used for text normalization and tokenization.
+
+    Args:
+        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to accept lowercase input and lowercase the output when decoding.
+        vocab_file (:obj:`str` or :obj:`pathlib.Path`, `optional`, defaults to :obj:`None`):
+            File containing the vocabulary. When unset, the tokenizer is loaded from the pretrained model.
+        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
+            The token used for delimiting words; it needs to be in the vocabulary.
+        translation_table (:obj:`Dict[int, str]`, `optional`, defaults to :obj:`{}`):
+            Table to use with `str.translate()` when preprocessing text (e.g., "-" -> " "). `str.translate()` looks
+            characters up by Unicode ordinal, so build the table with `str.maketrans()`.
+        words_to_remove (:obj:`Set[str]`, `optional`, defaults to :obj:`set()`):
+            Words to remove when preprocessing text (e.g., "sil").
+        untransliterator (:obj:`Callable[[str], str]`, `optional`, defaults to :obj:`None`):
+            Function that untransliterates text back into native writing system.
+    """
+
+    do_lower_case: bool = False
+    vocab_file: Optional[Union[str, pathlib.Path]] = None
+    word_delimiter_token: Optional[str] = "|"
+    translation_table: Optional[Dict[int, str]] = field(default_factory=dict)
+    words_to_remove: Optional[Set[str]] = field(default_factory=set)
+    untransliterator: Optional[Callable[[str], str]] = None
+
+    @classmethod
+    def from_name(cls, name: str):
+        """
+        Build one of the predefined orthographies: ``"librispeech"``, ``"timit"`` or ``"buckwalter"``.
+
+        Raises:
+            ValueError: if ``name`` is not one of the supported orthographies.
+        """
+        if name == "librispeech":
+            return cls()
+        if name == "timit":
+            return cls(
+                do_lower_case=True,
+                # break compounds like "quarter-century-old" and replace pauses "--"
+                translation_table=str.maketrans({"-": " "}),
+            )
+        if name == "buckwalter":
+            translation_table = {
+                "-": " ",  # sometimes used to represent pauses
+                "^": "v",  # fixing "tha" in arabic_speech_corpus dataset
+            }
+            return cls(
+                vocab_file=pathlib.Path(__file__).parent.joinpath("vocab/buckwalter.json"),
+                word_delimiter_token="/",  # "|" is Arabic letter alef with madda above
+                translation_table=str.maketrans(translation_table),
+                words_to_remove={"sil"},  # fixing "sil" in arabic_speech_corpus dataset
+                untransliterator=arabic.buckwalter.untransliterate,
+            )
+        raise ValueError(f"Unsupported orthography: '{name}'.")
+
+    def preprocess_for_training(self, text: str) -> str:
+        """
+        Apply the translation table, drop ``words_to_remove`` and collapse runs of whitespace into single spaces.
+        """
+        # TODO(elgeish) return a pipeline (e.g., from jiwer) instead? Or rely on branch predictor as is
+        # Both fields are Optional, so treat None like "nothing to translate/remove" instead of failing on len().
+        if self.translation_table:
+            text = text.translate(self.translation_table)
+        unwanted_words = self.words_to_remove or ()
+        # split()/join also cleans up whitespaces
+        return " ".join(word for word in text.split() if word not in unwanted_words)
+
+    def create_processor(self, model_args: ModelArguments) -> Wav2Vec2Processor:
+        """
+        Create a processor from the pretrained feature extractor and a tokenizer that follows this orthography.
+
+        The tokenizer is built from ``vocab_file`` when one is set; otherwise it is loaded from
+        ``model_args.model_name_or_path``.
+        """
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir
+        )
+        tokenizer_kwargs = dict(
+            cache_dir=model_args.cache_dir,
+            do_lower_case=self.do_lower_case,
+            word_delimiter_token=self.word_delimiter_token,
+        )
+        if self.vocab_file:
+            tokenizer = Wav2Vec2CTCTokenizer(self.vocab_file, **tokenizer_kwargs)
+        else:
+            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
+        return Wav2Vec2Processor(feature_extractor, tokenizer)
+
+
@dataclass
 class DataCollatorCTCWithPadding:
    """
@@ -201,25 +342,72 @@ def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    configure_logger(model_args, training_args)

-    model = Wav2Vec2ForCTC.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
-    processor = Wav2Vec2Processor.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    orthography = Orthography.from_name(data_args.orthography.lower())
+    processor = orthography.create_processor(model_args)
+    model = Wav2Vec2ForCTC.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        gradient_checkpointing=model_args.gradient_checkpointing,
+        vocab_size=len(processor.tokenizer),
+    )

    train_dataset = datasets.load_dataset(
        data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name
    )
-    val_dataset = datasets.load_dataset(data_args.dataset_name, data_args.dataset_config_name, split="validation")
+    val_dataset = datasets.load_dataset(
+        data_args.dataset_name, data_args.dataset_config_name, split=data_args.validation_split_name
+    )

    wer_metric = datasets.load_metric("wer")
+    target_sr = processor.feature_extractor.sampling_rate if data_args.target_feature_extractor_sampling_rate else None
+    vocabulary_chars_str = "".join(t for t in processor.tokenizer.get_vocab().keys() if len(t) == 1)
+    vocabulary_text_cleaner = re.compile(  # remove characters not in vocabulary
+        # raw f-string so that "\s" reaches the regex engine untouched instead of being an invalid string escape
+        rf"[^\s{re.escape(vocabulary_chars_str)}]",  # allow space in addition to chars in vocabulary
+        flags=re.IGNORECASE if processor.tokenizer.do_lower_case else 0,
+    )
+    text_updates = []

-    def map_to_array(batch):
-        speech_array, sampling_rate = sf.read(batch["file"])
-        batch["speech"] = speech_array
-        batch["sampling_rate"] = sampling_rate
-        return batch
+    def prepare_example(example):  # TODO(elgeish) make use of multiprocessing?
+        # Load the audio (resampled to `target_sr` when it is set) and keep the sampling rate next to it.
+        speech, sampling_rate = librosa.load(example[data_args.speech_file_column], sr=target_sr)
+        example["speech"] = speech
+        example["sampling_rate"] = sampling_rate
+        if data_args.max_duration_in_seconds is not None:
+            example["duration_in_seconds"] = len(speech) / sampling_rate
+        # Normalize and clean up text; order matters!
+        text_column = data_args.target_text_column
+        original_text = example[text_column]
+        cleaned_text = vocabulary_text_cleaner.sub("", orthography.preprocess_for_training(original_text))
+        if cleaned_text != original_text:
+            # remember the change so it can be reported after the datasets are mapped
+            text_updates.append((original_text, cleaned_text))
+            example[text_column] = cleaned_text
+        return example

-    train_dataset = train_dataset.map(map_to_array, remove_columns=["file"])
-    val_dataset = val_dataset.map(map_to_array, remove_columns=["file"])
+    train_dataset = train_dataset.map(prepare_example, remove_columns=[data_args.speech_file_column])
+    val_dataset = val_dataset.map(prepare_example, remove_columns=[data_args.speech_file_column])
+
+    if data_args.max_duration_in_seconds is not None:
+
+        def filter_by_max_duration(example):
+            # keep only examples whose audio is no longer than the configured limit
+            return example["duration_in_seconds"] <= data_args.max_duration_in_seconds
+
+        old_train_size = len(train_dataset)
+        old_val_size = len(val_dataset)
+        train_dataset = train_dataset.filter(filter_by_max_duration, remove_columns=["duration_in_seconds"])
+        val_dataset = val_dataset.filter(filter_by_max_duration, remove_columns=["duration_in_seconds"])
+        # Filtering can only shrink a split, so the number removed is old size minus new size.
+        if old_train_size > len(train_dataset):
+            logger.warning(
+                f"Filtered out {old_train_size - len(train_dataset)} train example(s) longer than {data_args.max_duration_in_seconds} second(s)."
+            )
+        if old_val_size > len(val_dataset):
+            logger.warning(
+                f"Filtered out {old_val_size - len(val_dataset)} validation example(s) longer than {data_args.max_duration_in_seconds} second(s)."
+            )
+    logger.info(f"Split sizes: {len(train_dataset)} train and {len(val_dataset)} validation.")
+
+    logger.warning(f"Updated {len(text_updates)} transcript(s) using '{data_args.orthography}' orthography rules.")
+    if logger.isEnabledFor(logging.DEBUG):
+        for original_text, updated_text in text_updates:
+            logger.debug(f'Updated text: "{original_text}" -> "{updated_text}"')
+    text_updates = None

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
@@ -229,7 +417,7 @@ def main():

        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        with processor.as_target_processor():
-            batch["labels"] = processor(batch["text"]).input_ids
+            batch["labels"] = processor(batch[data_args.target_text_column]).input_ids
        return batch

    train_dataset = train_dataset.map(
@@ -256,6 +444,13 @@ def main():
        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+        if logger.isEnabledFor(logging.DEBUG):
+            for reference, predicted in zip(label_str, pred_str):
+                logger.debug(f'reference: "{reference}"')
+                logger.debug(f'predicted: "{predicted}"')
+                if orthography.untransliterator is not None:
+                    logger.debug(f'reference (untransliterated): "{orthography.untransliterator(reference)}"')
+                    logger.debug(f'predicted (untransliterated): "{orthography.untransliterator(predicted)}"')

        wer = wer_metric.compute(predictions=pred_str, references=label_str)

--- a/examples/research_projects/wav2vec2/run_common_voice.py
+++ b/examples/research_projects/wav2vec2/run_common_voice.py
@@ -0,0 +1,511 @@
+#!/usr/bin/env python3
+import json
+import logging
+import os
+import re
+import sys
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+
+import datasets
+import numpy as np
+import torch
+import torchaudio
+from packaging import version
+from torch import nn
+
+import transformers
+from transformers import (
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    Wav2Vec2CTCTokenizer,
+    Wav2Vec2FeatureExtractor,
+    Wav2Vec2ForCTC,
+    Wav2Vec2Processor,
+    is_apex_available,
+    set_seed,
+)
+from transformers.trainer_utils import get_last_checkpoint, is_main_process
+
+
+if is_apex_available():
+    from apex import amp
+
+
+if version.parse(torch.__version__) >= version.parse("1.6"):
+    _is_native_amp_available = True
+    from torch.cuda.amp import autocast
+
+logger = logging.getLogger(__name__)
+
+
+def list_field(default=None, metadata=None):
+    """
+    Dataclass field whose default is a list.
+
+    Each dataclass instance receives its own copy of ``default`` (or ``None`` when no default is given), so
+    mutating the list on one instance cannot alter the default seen by later instances.
+    """
+    return field(default_factory=lambda: None if default is None else list(default), metadata=metadata)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    freeze_feature_extractor: Optional[bool] = field(
+        default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
+    )
+    attention_dropout: Optional[float] = field(
+        default=0.1, metadata={"help": "The dropout ratio for the attention probabilities."}
+    )
+    activation_dropout: Optional[float] = field(
+        default=0.1, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
+    )
+    hidden_dropout: Optional[float] = field(
+        default=0.1,
+        metadata={
+            "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
+        },
+    )
+    feat_proj_dropout: Optional[float] = field(
+        default=0.1,
+        metadata={"help": "The dropout probability for all 1D convolutional layers in feature extractor."},
+    )
+    mask_time_prob: Optional[float] = field(
+        default=0.05,
+        metadata={
+            # adjacent literals are concatenated, so each piece carries its own trailing space
+            "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector "
+            "span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature "
+            "vectors will be masked along the time axis. This is only relevant if ``apply_spec_augment is True``."
+        },
+    )
+    gradient_checkpointing: Optional[bool] = field(
+        default=True,
+        metadata={
+            "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
+        },
+    )
+    layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_split_name: Optional[str] = field(
+        default="train+validation",
+        metadata={
+            "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train+validation'"
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_val_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
+            "value if set."
+        },
+    )
+    # The last entry is U+FFFD (the Unicode replacement character), written as an escape so it survives
+    # re-encoding; "EF BF BD" is its UTF-8 byte sequence.
+    chars_to_ignore: List[str] = list_field(
+        default=[",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "\ufffd"],
+        metadata={"help": "A list of characters to remove from the transcripts."},
+    )
+
+
+@dataclass
+class DataCollatorCTCWithPadding:
+    """
+    Data collator that dynamically pads the audio inputs and the label ids of a batch.
+
+    Args:
+        processor (:class:`~transformers.Wav2Vec2Processor`)
+            The processor used for processing the data; its ``pad`` method does the padding.
+        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
+            Padding strategy forwarded to ``processor.pad`` for both inputs and labels.
+        max_length (:obj:`int`, `optional`):
+            ``max_length`` forwarded to ``processor.pad`` for the ``input_values``.
+        max_length_labels (:obj:`int`, `optional`):
+            ``max_length`` forwarded to ``processor.pad`` for the ``labels``.
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            ``pad_to_multiple_of`` forwarded to ``processor.pad`` for the ``input_values``.
+        pad_to_multiple_of_labels (:obj:`int`, `optional`):
+            ``pad_to_multiple_of`` forwarded to ``processor.pad`` for the ``labels``.
+    """
+
+    processor: Wav2Vec2Processor
+    padding: Union[bool, str] = True
+    max_length: Optional[int] = None
+    max_length_labels: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    pad_to_multiple_of_labels: Optional[int] = None
+
+    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+        # Inputs and labels have different lengths and are padded with different settings,
+        # so they are separated before padding.
+        audio_inputs = [{"input_values": item["input_values"]} for item in features]
+        label_inputs = [{"input_ids": item["labels"]} for item in features]
+
+        batch = self.processor.pad(
+            audio_inputs,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            return_tensors="pt",
+        )
+        with self.processor.as_target_processor():
+            padded_labels = self.processor.pad(
+                label_inputs,
+                padding=self.padding,
+                max_length=self.max_length_labels,
+                pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                return_tensors="pt",
+            )
+
+        # positions where the attention mask is not 1 are padding; set them to -100 to ignore loss correctly
+        is_padding = padded_labels.attention_mask.ne(1)
+        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
+
+        return batch
+
+
+class CTCTrainer(Trainer):
+    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
+        """
+        Perform a training step on a batch of inputs.
+
+        Subclass and override to inject custom behavior.
+
+        Args:
+            model (:obj:`nn.Module`):
+                The model to train.
+            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+                The inputs and targets of the model.
+
+                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
+
+        Return:
+            :obj:`torch.Tensor`: The tensor with training loss on this batch.
+
+        Raises:
+            ValueError: when running on more than one GPU and the configured ``ctc_loss_reduction`` is neither
+                ``"mean"`` nor ``"sum"``.
+        """
+
+        model.train()
+        inputs = self._prepare_inputs(inputs)
+
+        if self.use_amp:
+            with autocast():
+                loss = self.compute_loss(model, inputs)
+        else:
+            loss = self.compute_loss(model, inputs)
+
+        if self.args.n_gpu > 1:
+            # In this branch the config is reached through `model.module` (presumably a DataParallel wrapper),
+            # so read the reduction once and reuse it, including in the error message.
+            reduction = model.module.config.ctc_loss_reduction
+            if reduction == "mean":
+                loss = loss.mean()
+            elif reduction == "sum":
+                # normalize the summed loss by the number of label positions that are not ignored (-100)
+                loss = loss.sum() / (inputs["labels"] >= 0).sum()
+            else:
+                raise ValueError(f"{reduction} is not valid. Choose one of ['mean', 'sum']")
+
+        if self.args.gradient_accumulation_steps > 1:
+            loss = loss / self.args.gradient_accumulation_steps
+
+        if self.use_amp:
+            self.scaler.scale(loss).backward()
+        elif self.use_apex:
+            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
+                scaled_loss.backward()
+        elif self.deepspeed:
+            self.deepspeed.backward(loss)
+        else:
+            loss.backward()
+
+        return loss.detach()
+
+
+def main():
+    """
+    Fine-tune a :class:`Wav2Vec2ForCTC` model on a ``common_voice`` dataset configuration.
+
+    The function parses the model/data/training arguments, builds a character vocabulary from the train and test
+    transcripts (written to ``vocab.json``), converts the audio files and transcripts into model inputs and labels,
+    then trains and/or evaluates with :class:`CTCTrainer` depending on ``--do_train`` / ``--do_eval``.
+
+    Return:
+        :obj:`dict`: ``results``, which is currently never populated (metrics are only logged and saved).
+
+    Raises:
+        :obj:`ValueError`: If the output directory exists, is not empty, holds no checkpoint and
+        ``--overwrite_output_dir`` was not passed.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Get the datasets:
+    train_dataset = datasets.load_dataset(
+        "common_voice", data_args.dataset_config_name, split=data_args.train_split_name
+    )
+    eval_dataset = datasets.load_dataset("common_voice", data_args.dataset_config_name, split="test")
+
+    # Create and save tokenizer
+    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'
+
+    def remove_special_characters(batch):
+        # strip the ignored characters, lowercase, and append a trailing space to every transcript
+        batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
+        return batch
+
+    train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
+    eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
+
+    def extract_all_chars(batch):
+        # called once on the whole split (batch_size=-1): collect the set of characters used in its transcripts
+        all_text = " ".join(batch["text"])
+        vocab = list(set(all_text))
+        return {"vocab": [vocab], "all_text": [all_text]}
+
+    vocab_train = train_dataset.map(
+        extract_all_chars,
+        batched=True,
+        batch_size=-1,
+        keep_in_memory=True,
+        remove_columns=train_dataset.column_names,
+    )
+    # the test vocabulary must come from the evaluation split, otherwise its characters never reach the vocab
+    vocab_test = eval_dataset.map(
+        extract_all_chars,
+        batched=True,
+        batch_size=-1,
+        keep_in_memory=True,
+        remove_columns=eval_dataset.column_names,
+    )
+
+    vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
+    vocab_dict = {v: k for k, v in enumerate(vocab_list)}
+    # the space character is stored as "|", which is also passed as the tokenizer's word delimiter below
+    vocab_dict["|"] = vocab_dict[" "]
+    del vocab_dict[" "]
+    vocab_dict["[UNK]"] = len(vocab_dict)
+    vocab_dict["[PAD]"] = len(vocab_dict)
+
+    with open("vocab.json", "w") as vocab_file:
+        json.dump(vocab_dict, vocab_file)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    tokenizer = Wav2Vec2CTCTokenizer(
+        "vocab.json",
+        unk_token="[UNK]",
+        pad_token="[PAD]",
+        word_delimiter_token="|",
+    )
+    feature_extractor = Wav2Vec2FeatureExtractor(
+        feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True
+    )
+    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+    model = Wav2Vec2ForCTC.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        activation_dropout=model_args.activation_dropout,
+        attention_dropout=model_args.attention_dropout,
+        hidden_dropout=model_args.hidden_dropout,
+        feat_proj_dropout=model_args.feat_proj_dropout,
+        mask_time_prob=model_args.mask_time_prob,
+        gradient_checkpointing=model_args.gradient_checkpointing,
+        layerdrop=model_args.layerdrop,
+        ctc_loss_reduction="mean",
+        pad_token_id=processor.tokenizer.pad_token_id,
+        vocab_size=len(processor.tokenizer),
+    )
+
+    if data_args.max_train_samples is not None:
+        train_dataset = train_dataset.select(range(data_args.max_train_samples))
+
+    if data_args.max_val_samples is not None:
+        eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
+
+    resampler = torchaudio.transforms.Resample(48_000, 16_000)
+
+    # Preprocessing the datasets.
+    # We need to read the audio files as arrays and tokenize the targets.
+    def speech_file_to_array_fn(batch):
+        # NOTE(review): the resampler is fixed to 48 kHz -> 16 kHz and the `sampling_rate` returned by
+        # `torchaudio.load` is not checked, so files recorded at another rate would be resampled incorrectly.
+        speech_array, sampling_rate = torchaudio.load(batch["path"])
+        batch["speech"] = resampler(speech_array).squeeze().numpy()
+        batch["sampling_rate"] = 16_000
+        batch["target_text"] = batch["text"]
+        return batch
+
+    train_dataset = train_dataset.map(
+        speech_file_to_array_fn,
+        remove_columns=train_dataset.column_names,
+        num_proc=data_args.preprocessing_num_workers,
+    )
+    eval_dataset = eval_dataset.map(
+        speech_file_to_array_fn,
+        remove_columns=eval_dataset.column_names,
+        num_proc=data_args.preprocessing_num_workers,
+    )
+
+    def prepare_dataset(batch):
+        # check that all files have the correct sampling rate
+        assert (
+            len(set(batch["sampling_rate"])) == 1
+        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
+        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
+        # Setup the processor for targets
+        with processor.as_target_processor():
+            batch["labels"] = processor(batch["target_text"]).input_ids
+        return batch
+
+    train_dataset = train_dataset.map(
+        prepare_dataset,
+        remove_columns=train_dataset.column_names,
+        batch_size=training_args.per_device_train_batch_size,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+    )
+    eval_dataset = eval_dataset.map(
+        prepare_dataset,
+        remove_columns=eval_dataset.column_names,
+        batch_size=training_args.per_device_train_batch_size,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+    )
+
+    # Metric
+    wer_metric = datasets.load_metric("wer")
+
+    def compute_metrics(pred):
+        pred_logits = pred.predictions
+        pred_ids = np.argmax(pred_logits, axis=-1)
+
+        # the collator replaced label padding by -100; restore the pad token id before decoding
+        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+        pred_str = processor.batch_decode(pred_ids)
+        # we do not want to group tokens when computing the metrics
+        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+        wer = wer_metric.compute(predictions=pred_str, references=label_str)
+
+        return {"wer": wer}
+
+    if model_args.freeze_feature_extractor:
+        model.freeze_feature_extractor()
+
+    # Data collator
+    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
+
+    # Initialize our Trainer
+    trainer = CTCTrainer(
+        model=model,
+        data_collator=data_collator,
+        args=training_args,
+        compute_metrics=compute_metrics,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=eval_dataset if training_args.do_eval else None,
+        tokenizer=processor.feature_extractor,
+    )
+
+    # Training
+    if training_args.do_train:
+        # resume from the detected checkpoint first, else from a local model directory, else start fresh
+        if last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        elif os.path.isdir(model_args.model_name_or_path):
+            checkpoint = model_args.model_name_or_path
+        else:
+            checkpoint = None
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()
+
+        # save the feature_extractor and the tokenizer
+        if is_main_process(training_args.local_rank):
+            processor.save_pretrained(training_args.output_dir)
+
+        metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
+        )
+        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
+
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+
+    # Evaluation
+    # NOTE(review): `results` is never filled in; the evaluation metrics are only logged and saved below.
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
+        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
+
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/research_projects/wav2vec2/vocab/buckwalter.json
+++ b/examples/research_projects/wav2vec2/vocab/buckwalter.json
@@ -0,0 +1,58 @@
+{
+    "<pad>": 0,
+    "<s>": 1,
+    "</s>": 2,
+    "<unk>": 3,
+    "/": 4,
+    "'": 5,
+    "|": 6,
+    ">": 7,
+    "&": 8,
+    "<": 9,
+    "}": 10,
+    "A": 11,
+    "b": 12,
+    "p": 13,
+    "t": 14,
+    "v": 15,
+    "j": 16,
+    "H": 17,
+    "x": 18,
+    "d": 19,
+    "*": 20,
+    "r": 21,
+    "z": 22,
+    "s": 23,
+    "$": 24,
+    "S": 25,
+    "D": 26,
+    "T": 27,
+    "Z": 28,
+    "E": 29,
+    "g": 30,
+    "_": 31,
+    "f": 32,
+    "q": 33,
+    "k": 34,
+    "l": 35,
+    "m": 36,
+    "n": 37,
+    "h": 38,
+    "w": 39,
+    "Y": 40,
+    "y": 41,
+    "F": 42,
+    "N": 43,
+    "K": 44,
+    "a": 45,
+    "u": 46,
+    "i": 47,
+    "~": 48,
+    "o": 49,
+    "`": 50,
+    "{": 51,
+    "P": 52,
+    "J": 53,
+    "V": 54,
+    "G": 55
+}
--- a/examples/research_projects/zero-shot-distillation/distill_classifier.py
+++ b/examples/research_projects/zero-shot-distillation/distill_classifier.py
@@ -152,7 +152,7 @@ def get_entailment_id(config):
    for label, ind in config.label2id.items():
        if label.lower().startswith("entail"):
            return ind
-    logging.warning("Could not identify entailment dimension from teacher config label2id. Setting to -1.")
+    logger.warning("Could not identify entailment dimension from teacher config label2id. Setting to -1.")
    return -1


--- a/examples/seq2seq/README.md
+++ b/examples/seq2seq/README.md
@@ -24,10 +24,10 @@ For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2s
 ### Supported Architectures

 - `BartForConditionalGeneration`
+- `FSMTForConditionalGeneration` (translation only)
+- `MBartForConditionalGeneration`
 - `MarianMTModel`
 - `PegasusForConditionalGeneration`
- `MBartForConditionalGeneration`
- `FSMTForConditionalGeneration` (translation only)
 - `T5ForConditionalGeneration`

 `run_summarization.py` and `run_translation.py` are lightweight examples of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
@@ -43,17 +43,21 @@ python examples/seq2seq/run_summarization.py \
    --model_name_or_path t5-small \
    --do_train \
    --do_eval \
-    --dataset_name xsum \
+    --dataset_name cnn_dailymail \
+    --dataset_config "3.0.0" \
+    --source_prefix "summarize: " \
    --output_dir /tmp/tst-summarization \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```

-CNN/DailyMail dataset is another commonly used dataset for the task of summarization. To use it replace `--dataset_name xsum` with `--dataset_name cnn_dailymail --dataset_config "3.0.0"`.
+Only T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "summarize: "`.
+
+We used the CNN/DailyMail dataset in this example because `t5-small` was trained on it, so one can get good scores even when fine-tuning on a very small sample.
+
+The Extreme Summarization (XSum) dataset is another commonly used dataset for the task of summarization. To use it, replace `--dataset_name cnn_dailymail --dataset_config "3.0.0"` with `--dataset_name xsum`.

 And here is how you would use it on your own files, after adjusting the values for the arguments
 `--train_file`, `--validation_file`, `--text_column` and `--summary_column` to match your setup:
@@ -65,13 +69,12 @@ python examples/seq2seq/run_summarization.py \
    --do_eval \
    --train_file path_to_csv_or_jsonlines_file \
    --validation_file path_to_csv_or_jsonlines_file \
+    --source_prefix "summarize: " \
    --output_dir /tmp/tst-summarization \
    --overwrite_output_dir \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```

 The task of summarization supports custom CSV and JSONLINES formats.
@@ -111,7 +114,7 @@ and you wanted to select only `text` and `summary`, then you'd pass these additi
    --summary_column summary \
 ```

-#### Custom JSONFILES Files
+#### Custom JSONLINES Files

 The second supported format is jsonlines. Here is an example of a jsonlines custom data file.

@@ -135,11 +138,11 @@ And as with the CSV files, you can specify which values to select from the file,

 ### Translation

-Here is an example of a translation fine-tuning with T5:
+Here is an example of a translation fine-tuning with a MarianMT model:

 ```bash
 python examples/seq2seq/run_translation.py \
-    --model_name_or_path t5-small \
+    --model_name_or_path Helsinki-NLP/opus-mt-en-ro \
    --do_train \
    --do_eval \
    --source_lang en \
@@ -150,12 +153,35 @@ python examples/seq2seq/run_translation.py \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```

-And the same with MBart:
+MBart and some T5 models require special handling.
+
+T5 models `t5-small`, `t5-base`, `t5-large`, `t5-3b` and `t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}: "`. For example:
+
+```bash
+python examples/seq2seq/run_translation.py \
+    --model_name_or_path t5-small \
+    --do_train \
+    --do_eval \
+    --source_lang en \
+    --target_lang ro \
+    --source_prefix "translate English to Romanian: " \
+    --dataset_name wmt16 \
+    --dataset_config_name ro-en \
+    --output_dir /tmp/tst-translation \
+    --per_device_train_batch_size=4 \
+    --per_device_eval_batch_size=4 \
+    --overwrite_output_dir \
+    --predict_with_generate
+```
+
+If you get a terrible BLEU score, make sure that you didn't forget to use the `--source_prefix` argument.
+
+For the aforementioned group of T5 models, remember that if you switch to a different language pair, you must adjust the source and target values in all 3 language-specific command line arguments: `--source_lang`, `--target_lang` and `--source_prefix`.
+
+MBart models require a different format for `--source_lang` and `--target_lang` values, e.g. instead of `en` it expects `en_XX`, for `ro` it expects `ro_RO`. The full MBart specification for language codes can be found [here](https://huggingface.co/facebook/mbart-large-cc25). For example:

 ```bash
 python examples/seq2seq/run_translation.py \
@@ -170,18 +196,9 @@ python examples/seq2seq/run_translation.py \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```

-Note, that depending on the used model additional language-specific command-line arguments are sometimes required. Specifically:
-
-* MBart models require different `--{source,target}_lang` values, e.g. in place of `en` it expects `en_XX`, for `ro` it expects `ro_RO`. The full MBart specification for language codes can be looked up [here](https://huggingface.co/facebook/mbart-large-cc25)
-* T5 models can use a `--source_prefix` argument to override the otherwise automated prefix of the form `translate {source_lang} to {target_lang}` for `run_translation.py` and `summarize: ` for `run_summarization.py`
-
-Also, if you switch to a different language pair, make sure to adjust the source and target values in all command line arguments.
-
 And here is how you would use the translation finetuning on your own files, after adjusting the
 values for the arguments `--train_file`, `--validation_file` to match your setup:

@@ -192,6 +209,7 @@ python examples/seq2seq/run_translation.py \
    --do_eval \
    --source_lang en \
    --target_lang ro \
+    --source_prefix "translate English to Romanian: " \
    --dataset_name wmt16 \
    --dataset_config_name ro-en \
    --train_file path_to_jsonlines_file \
@@ -200,9 +218,7 @@ python examples/seq2seq/run_translation.py \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```

 The task of translation supports only custom JSONLINES files, with each line being a dictionary with a key `"translation"` and its value another dictionary whose keys is the language pair. For example:
@@ -213,7 +229,7 @@ The task of translation supports only custom JSONLINES files, with each line bei
 ```
 Here the languages are Romanian (`ro`) and English (`en`).

-If you want to use a pre-processed dataset that leads to high bleu scores, but for the `en-de` language pair, you can use `--dataset_name wmt14-en-de-pre-processed`, as following:
+If you want to use a pre-processed dataset that leads to high BLEU scores, but for the `en-de` language pair, you can use `--dataset_name stas/wmt14-en-de-pre-processed`, as follows:

 ```bash
 python examples/seq2seq/run_translation.py \
@@ -222,12 +238,11 @@ python examples/seq2seq/run_translation.py \
    --do_eval \
    --source_lang en \
    --target_lang de \
-    --dataset_name wmt14-en-de-pre-processed \
+    --source_prefix "translate English to German: " \
+    --dataset_name stas/wmt14-en-de-pre-processed \
    --output_dir /tmp/tst-translation \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=4 \
    --overwrite_output_dir \
-    --predict_with_generate \
-    --max_train_samples 500 \
-    --max_val_samples 500
+    --predict_with_generate
 ```
--- a/examples/seq2seq/requirements.txt
+++ b/examples/seq2seq/requirements.txt
@@ -3,4 +3,5 @@ sentencepiece != 0.1.92
 protobuf
 sacrebleu >= 1.4.12
 rouge-score
-nltk
+nltk
+py7zr
--- a/examples/seq2seq/run_summarization.py
+++ b/examples/seq2seq/run_summarization.py
@@ -38,7 +38,6 @@ from transformers import (
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
-    default_data_collator,
    set_seed,
 )
 from transformers.file_utils import is_offline_mode
@@ -47,7 +46,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -295,7 +294,7 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
@@ -466,15 +465,12 @@ def main():

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
-    if data_args.pad_to_max_length:
-        data_collator = default_data_collator
-    else:
-        data_collator = DataCollatorForSeq2Seq(
-            tokenizer,
-            model=model,
-            label_pad_token_id=label_pad_token_id,
-            pad_to_multiple_of=8 if training_args.fp16 else None,
-        )
+    data_collator = DataCollatorForSeq2Seq(
+        tokenizer,
+        model=model,
+        label_pad_token_id=label_pad_token_id,
+        pad_to_multiple_of=8 if training_args.fp16 else None,
+    )

    # Metric
    metric = load_metric("rouge")
--- a/examples/seq2seq/run_translation.py
+++ b/examples/seq2seq/run_translation.py
@@ -46,7 +46,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -264,7 +264,7 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/examples/tests/deepspeed/test_deepspeed.py
@@ -12,11 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import dataclasses
+import io
 import json
 import os
 import sys
 import unittest
+from copy import deepcopy

+from transformers import TrainingArguments
+from transformers.file_utils import WEIGHTS_NAME
 from transformers.integrations import is_deepspeed_available
 from transformers.testing_utils import (
    CaptureStd,
@@ -33,7 +38,7 @@ from transformers.trainer_utils import set_seed

 bindir = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(f"{bindir}/../../../tests")
-from test_trainer import get_regression_trainer  # noqa
+from test_trainer import TrainerIntegrationCommon, get_regression_trainer  # noqa


 set_seed(42)
@@ -58,25 +63,95 @@ def require_deepspeed(test_case):

@require_deepspeed
@require_torch_gpu
-class TrainerIntegrationDeepSpeed(TestCasePlus):
-    """ This class is for testing directly via get_regression_trainer """
+class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
+    """
+
+    This class is for testing directly via get_regression_trainer
+
+    It mixes in `TrainerIntegrationCommon` which already has a lot of helper validation methods which we can re-use here.
+    """

    def setUp(self):
        super().setUp()
+
+        args = TrainingArguments(".")
+        self.n_epochs = args.num_train_epochs
+        self.batch_size = args.train_batch_size
+
        self.dist_env_1_gpu = dict(
            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )
        self.ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
+        with io.open(self.ds_config_file, "r", encoding="utf-8") as f:
+            self.ds_config_dict = json.load(f)

    def test_fake_notebook_no_launcher(self):
-
        # this setup emulates a notebook where a launcher needs to be emulated by hand
-
-        with CaptureStd() as cs:
+        with CaptureStd() as cs:  # noqa
            with mockenv_context(**self.dist_env_1_gpu):
                trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
                trainer.train()
-        assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
+        # fixme:
+        # assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
+
+    # Test various combos
+    # 1. DS scheduler + DS optimizer: this is already tested by most other tests
+    # 2. HF scheduler + HF optimizer:
+    # 3. DS scheduler + HF optimizer:
+    # 4. HF scheduler + DS optimizer:
+
+    def test_hf_scheduler_hf_optimizer(self):
+        """Train without the DeepSpeed optimizer and scheduler config sections and check that `a` changed."""
+        initial_a = 0
+        with mockenv_context(**self.dist_env_1_gpu):
+            config = deepcopy(self.ds_config_dict)
+            # drop both sections to force the default HF Trainer optimizer and scheduler
+            for section in ("optimizer", "scheduler"):
+                del config[section]
+            config["zero_optimization"]["cpu_offload"] = False
+            config["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+            trainer = get_regression_trainer(a=initial_a, local_rank=0, deepspeed=config)
+            trainer.train()
+        self.assertNotEqual(trainer.model.a.item(), initial_a)
+
+    def test_ds_scheduler_hf_optimizer(self):
+        """Train with the config's scheduler section but without its optimizer section and check that `a` changed."""
+        initial_a = 0
+        with mockenv_context(**self.dist_env_1_gpu):
+            config = deepcopy(self.ds_config_dict)
+            # only the optimizer section is dropped to force the default HF Trainer optimizer
+            del config["optimizer"]
+            config["zero_optimization"]["cpu_offload"] = False
+            config["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+            trainer = get_regression_trainer(a=initial_a, local_rank=0, deepspeed=config)
+            trainer.train()
+        self.assertNotEqual(trainer.model.a.item(), initial_a)
+
+    def test_hf_scheduler_ds_optimizer(self):
+        """Check that training raises when the config keeps its optimizer section but has no scheduler section."""
+        # this combo is not possible at the moment
+        with mockenv_context(**self.dist_env_1_gpu):
+            ds_config_dict = deepcopy(self.ds_config_dict)
+            del ds_config_dict["scheduler"]  # force default HF Trainer scheduler
+            ds_config_dict["zero_optimization"]["cpu_offload"] = False
+            ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
+            with self.assertRaises(Exception) as context:
+                trainer.train()
+        # assertIn shows the actual exception text when the expected message is missing
+        self.assertIn("HF scheduler + DeepSpeed optimizer combination is not possible", str(context.exception))
+
+    def test_hf_optimizer_with_offload(self):
+        """Check that training raises when `cpu_offload` is enabled and the config has no optimizer section."""
+        # must not allow non-DS optimizer when using ZERO-offload
+        with mockenv_context(**self.dist_env_1_gpu):
+            ds_config_dict = deepcopy(self.ds_config_dict)
+            del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
+            ds_config_dict["zero_optimization"]["cpu_offload"] = True
+            # sanity check - should the default config change
+            assert (
+                "cpu_offload" in ds_config_dict["zero_optimization"]
+                and ds_config_dict["zero_optimization"]["cpu_offload"] is True
+            ), "ensure the config is set up correctly"
+            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
+            with self.assertRaises(Exception) as context:
+                trainer.train()
+        # assertIn shows the actual exception text when the expected message is missing
+        self.assertIn("ZeRO Offload can only work with DeepSpeed optimizers", str(context.exception))

    def test_early_get_last_lr(self):
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
@@ -160,6 +235,101 @@ class TrainerIntegrationDeepSpeed(TestCasePlus):
        # see the note above how to get identical loss on a small bs
        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)

+    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, is_pretrained=True):
+        # adapted from TrainerIntegrationCommon.check_saved_checkpoints; `is_pretrained` is not used in this body
+
+        file_list = [WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"]
+        ds_file_list = ["mp_rank_00_model_states.pt", "zero_pp_rank_0_mp_rank_00optim_states.pt"]
+
+        for step in range(freq, total, freq):
+            checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
+            self.assertTrue(os.path.isdir(checkpoint))
+
+            # common files, expected directly inside the checkpoint folder
+            for filename in file_list:
+                self.assertTrue(os.path.isfile(os.path.join(checkpoint, filename)))
+
+            # ds files
+            ds_path = os.path.join(checkpoint, f"global_step{step}")
+            for filename in ds_file_list:
+                # each DeepSpeed state file is expected inside the per-step
+                # "global_step{step}" sub-folder of the checkpoint folder
+                self.assertTrue(os.path.isfile(os.path.join(ds_path, filename)))
+
+    def test_save_checkpoints(self):
+        # adapted from TrainerIntegrationTest.test_save_checkpoints
+
+        output_dir = self.get_auto_remove_tmp_dir()
+        ds_config_dict = deepcopy(self.ds_config_dict)
+        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+        freq = 5
+
+        # train with `save_steps=freq` so that checkpoints get written to `output_dir`
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(
+                output_dir=output_dir,
+                save_steps=freq,
+                deepspeed=ds_config_dict,
+            )
+            trainer.train()
+
+        total = int(self.n_epochs * 64 / self.batch_size)  # 64: presumably the default train_len - TODO confirm
+        self.check_saved_checkpoints_deepspeed(output_dir, freq, total)
+
+    def test_can_resume_training(self):
+        # adapted from TrainerIntegrationTest.test_can_resume_training
+
+        output_dir = self.get_auto_remove_tmp_dir()
+        ds_config_dict = deepcopy(self.ds_config_dict)
+        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+        kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(**kwargs)
+            trainer.train()
+            (a, b) = trainer.model.a.item(), trainer.model.b.item()
+            state = dataclasses.asdict(trainer.state)
+
+            checkpoint = os.path.join(output_dir, "checkpoint-5")
+
+            # Reinitialize trainer
+            trainer = get_regression_trainer(**kwargs)
+
+            trainer.train(resume_from_checkpoint=checkpoint)
+            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+            state1 = dataclasses.asdict(trainer.state)
+            self.assertEqual(a, a1)
+            self.assertEqual(b, b1)
+            self.check_trainer_state_are_the_same(state, state1)
+
+            # Now check with a later checkpoint that it also works when we span over one epoch
+            checkpoint = os.path.join(output_dir, "checkpoint-15")
+
+            # Reinitialize trainer and load model
+            trainer = get_regression_trainer(**kwargs)
+
+            trainer.train(resume_from_checkpoint=checkpoint)
+            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
+            state1 = dataclasses.asdict(trainer.state)
+            self.assertEqual(a, a1)
+            self.assertEqual(b, b1)
+            self.check_trainer_state_are_the_same(state, state1)
+
+            # Now check failures
+
+            # 1. fail to find a bogus checkpoint
+            trainer = get_regression_trainer(**kwargs)
+            with self.assertRaises(Exception) as context:
+                trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
+            self.assertIn("failed to resume from checkpoint", str(context.exception))
+
+            # 2. fail to find any checkpoint - due to a fresh output_dir
+            output_dir2 = self.get_auto_remove_tmp_dir()
+            trainer = get_regression_trainer(output_dir=output_dir2, deepspeed=ds_config_dict)
+            with self.assertRaises(Exception) as context:
+                trainer.train(resume_from_checkpoint=True)
+            self.assertIn("No valid checkpoint found in output directory", str(context.exception))
+

@slow
@require_deepspeed
--- a/examples/text-classification/run_glue.py
+++ b/examples/text-classification/run_glue.py
@@ -45,7 +45,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 task_to_keys = {
    "cola": ("sentence", None),
--- a/examples/text-classification/run_glue_no_trainer.py
+++ b/examples/text-classification/run_glue_no_trainer.py
@@ -222,13 +222,13 @@ def main():
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
-        is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
+        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
-            label_list = datasets["train"].unique("label")
+            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

@@ -249,7 +249,7 @@ def main():
        sentence1_key, sentence2_key = task_to_keys[args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
-        non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
+        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
--- a/examples/text-classification/run_tf_glue.py
+++ b/examples/text-classification/run_tf_glue.py
@@ -160,18 +160,16 @@ def main():
        level=logging.INFO,
    )
    logger.info(
-        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
-        training_args.n_replicas,
-        bool(training_args.n_replicas > 1),
-        training_args.fp16,
+        f"n_replicas: {training_args.n_replicas}, distributed training: {bool(training_args.n_replicas > 1)}, "
+        f"16-bits training: {training_args.fp16}",
    )
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    try:
        num_labels = glue_tasks_num_labels["mnli" if data_args.task_name == "mnli-mm" else data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
-        raise ValueError("Task not found: %s" % (data_args.task_name))
+        raise ValueError(f"Task not found: {data_args.task_name}")

    # Load pretrained model and tokenizer
    #
@@ -255,8 +253,8 @@ def main():
            logger.info("***** Eval results *****")

            for key, value in result.items():
-                logger.info("  %s = %s", key, value)
-                writer.write("%s = %s\n" % (key, value))
+                logger.info(f"  {key} = {value}")
+                writer.write(f"{key} = {value}\n")

            results.update(result)

--- a/examples/text-classification/run_tf_text_classification.py
+++ b/examples/text-classification/run_tf_text_classification.py
@@ -225,12 +225,10 @@ def main():
        level=logging.INFO,
    )
    logger.info(
-        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
-        training_args.n_replicas,
-        bool(training_args.n_replicas > 1),
-        training_args.fp16,
+        f"n_replicas: {training_args.n_replicas}, distributed training: {bool(training_args.n_replicas > 1)}, "
+        f"16-bits training: {training_args.fp16}"
    )
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Load pretrained model and tokenizer
    #
@@ -300,8 +298,8 @@ def main():
            logger.info("***** Eval results *****")

            for key, value in result.items():
-                logger.info("  %s = %s", key, value)
-                writer.write("%s = %s\n" % (key, value))
+                logger.info(f"  {key} = {value}")
+                writer.write(f"{key} = {value}\n")

            results.update(result)

--- a/examples/text-classification/run_xnli.py
+++ b/examples/text-classification/run_xnli.py
@@ -45,7 +45,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -207,14 +207,22 @@ def main():
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # Downloading and loading xnli dataset from the hub.
-    if model_args.train_language is None:
-        train_dataset = load_dataset("xnli", model_args.language, split="train")
-    else:
-        train_dataset = load_dataset("xnli", model_args.train_language, split="train")
+    if training_args.do_train:
+        if model_args.train_language is None:
+            train_dataset = load_dataset("xnli", model_args.language, split="train")
+        else:
+            train_dataset = load_dataset("xnli", model_args.train_language, split="train")
+        label_list = train_dataset.features["label"].names
+
+    if training_args.do_eval:
+        eval_dataset = load_dataset("xnli", model_args.language, split="validation")
+        label_list = eval_dataset.features["label"].names
+
+    if training_args.do_predict:
+        test_dataset = load_dataset("xnli", model_args.language, split="test")
+        label_list = test_dataset.features["label"].names

-    eval_dataset = load_dataset("xnli", model_args.language, split="validation")
    # Labels
-    label_list = train_dataset.features["label"].names
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
@@ -271,6 +279,9 @@ def main():
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
        )
+        # Log a few random samples from the training set:
+        for index in random.sample(range(len(train_dataset)), 3):
+            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    if training_args.do_eval:
        if data_args.max_val_samples is not None:
@@ -281,9 +292,14 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )

-    # Log a few random samples from the training set:
-    for index in random.sample(range(len(train_dataset)), 3):
-        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+    if training_args.do_predict:
+        if data_args.max_test_samples is not None:
+            test_dataset = test_dataset.select(range(data_args.max_test_samples))
+        test_dataset = test_dataset.map(
+            preprocess_function,
+            batched=True,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )

    # Get the metric function
    metric = load_metric("xnli")
@@ -307,7 +323,7 @@ def main():
    trainer = Trainer(
        model=model,
        args=training_args,
-        train_dataset=train_dataset,
+        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
@@ -346,6 +362,26 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

+    # Prediction
+    if training_args.do_predict:
+        logger.info("*** Predict ***")
+        predictions, labels, metrics = trainer.predict(test_dataset)
+
+        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset)
+        metrics["test_samples"] = min(max_test_samples, len(test_dataset))
+
+        trainer.log_metrics("test", metrics)
+        trainer.save_metrics("test", metrics)
+
+        predictions = np.argmax(predictions, axis=1)
+        output_test_file = os.path.join(training_args.output_dir, "test_predictions.txt")
+        if trainer.is_world_process_zero():
+            with open(output_test_file, "w") as writer:
+                writer.write("index\tprediction\n")
+                for index, item in enumerate(predictions):
+                    item = label_list[item]
+                    writer.write(f"{index}\t{item}\n")
+

 if __name__ == "__main__":
    main()
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -201,12 +201,7 @@ def main():
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

-    logger.warning(
-        "device: %s, n_gpu: %s, 16-bits training: %s",
-        args.device,
-        args.n_gpu,
-        args.fp16,
-    )
+    logger.warning(f"device: {args.device}, n_gpu: {args.n_gpu}, 16-bits training: {args.fp16}")

    set_seed(args)

@@ -271,7 +266,7 @@ def main():
    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
-        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
+        print(f"=== GENERATED SEQUENCE {generated_sequence_idx + 1} ===")
        generated_sequence = generated_sequence.tolist()

        # Decode text
--- a/examples/token-classification/README.md
+++ b/examples/token-classification/README.md
@@ -14,14 +14,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 -->

-## Token classification
+# Token classification

-Fine-tuning the library models for token classification task such as Named Entity Recognition (NER) or Parts-of-speech
-tagging (POS). The main scrip `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
+## PyTorch version
+
+Fine-tuning the library models for token classification task such as Named Entity Recognition (NER), Parts-of-speech
+tagging (POS) or phrase extraction (CHUNKS). The main script `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
 customize it to your needs if you need extra processing on your datasets.

 It will either run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
-training and validation.
+training and validation; you might just need to add some tweaks in the data preprocessing.

 The following example fine-tunes BERT on CoNLL-2003:

@@ -57,6 +59,74 @@ of the script.

 You can find the old version of the PyTorch script [here](https://github.com/huggingface/transformers/blob/master/examples/legacy/token-classification/run_ner.py).

+## PyTorch version, no Trainer
+
+Based on the script [run_ner_no_trainer.py](https://github.com/huggingface/transformers/blob/master/examples/token-classification/run_ner_no_trainer.py).
+
+Like `run_ner.py`, this script allows you to fine-tune any of the models on the [hub](https://huggingface.co/models) on a
+token classification task, either NER, POS or CHUNKS tasks or your own data in a csv or a JSON file. The main difference is that this
+script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
+
+It offers fewer options than the script with `Trainer` (for instance you can easily change the options for the optimizer
+or the dataloaders directly in the script) but still runs in a distributed setup, on TPU and supports mixed precision by
+means of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
+after installing it:
+
+```bash
+pip install accelerate
+```
+
+then
+
+```bash
+export TASK_NAME=ner
+
+python run_ner_no_trainer.py \
+  --model_name_or_path bert-base-cased \
+  --task_name $TASK_NAME \
+  --max_length 128 \
+  --per_device_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --output_dir /tmp/$TASK_NAME/
+```
+
+You can then use your usual launchers to run it in a distributed environment, but the easiest way is to run
+
+```bash
+accelerate config
+```
+
+and reply to the questions asked. Then
+
+```bash
+accelerate test
+```
+
+that will check everything is ready for training. Finally, you can launch training with
+
+```bash
+export TASK_NAME=ner
+
+accelerate launch run_ner_no_trainer.py \
+  --model_name_or_path bert-base-cased \
+  --task_name $TASK_NAME \
+  --max_length 128 \
+  --per_device_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --output_dir /tmp/$TASK_NAME/
+```
+
+This command is the same and will work for:
+
+- a CPU-only setup
+- a setup with one GPU
+- a distributed training with several GPUs (single or multi node)
+- a training on TPUs
+
+Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.
+
 ### TensorFlow version

 The following examples are covered in this section:
--- a/examples/token-classification/run_ner.py
+++ b/examples/token-classification/run_ner.py
@@ -45,7 +45,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.4.0")
+check_min_version("4.5.0")

 logger = logging.getLogger(__name__)

@@ -213,7 +213,7 @@ def main():
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
+    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
--- a/examples/token-classification/run_ner_no_trainer.py
+++ b/examples/token-classification/run_ner_no_trainer.py
@@ -0,0 +1,530 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning a 🤗 Transformers model on token classification tasks (NER, POS, CHUNKS) relying on the accelerate library
+without using a Trainer.
+"""
+
+import argparse
+import logging
+import math
+import os
+import random
+
+import datasets
+import torch
+from datasets import ClassLabel, load_dataset, load_metric
+from torch.utils.data.dataloader import DataLoader
+from tqdm.auto import tqdm
+
+import transformers
+from accelerate import Accelerator
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_MAPPING,
+    AdamW,
+    AutoConfig,
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    DataCollatorForTokenClassification,
+    SchedulerType,
+    default_data_collator,
+    get_scheduler,
+    set_seed,
+)
+
+
+logger = logging.getLogger(__name__)
+# You should update this to your particular problem to have better documentation of `model_type`
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def parse_args():
+    """Parse the command-line arguments, validate the data-source options and create `output_dir` if given."""
+    parser = argparse.ArgumentParser(
+        description="Finetune a transformers model on a token classification task (NER) with accelerate library"
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default=None,
+        help="The name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The configuration name of the dataset to use (via the datasets library).",
+    )
+    parser.add_argument(
+        "--train_file", type=str, default=None, help="A csv or a json file containing the training data."
+    )
+    parser.add_argument(
+        "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data."
+    )
+    parser.add_argument(
+        "--max_length",
+        type=int,
+        default=128,
+        help=(
+            "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated,"
+            " sequences shorter will be padded if `--pad_to_max_length` is passed."
+        ),
+    )
+    parser.add_argument(
+        "--pad_to_max_length",
+        action="store_true",
+        help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--config_name",
+        type=str,
+        default=None,
+        help="Pretrained config name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        type=str,
+        default=None,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--per_device_train_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--per_device_eval_batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per device) for the evaluation dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=None,
+        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--lr_scheduler_type",
+        type=SchedulerType,
+        default="linear",
+        help="The scheduler type to use.",
+        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
+    )
+    parser.add_argument(
+        "--num_warmup_steps", type=int, default=0, help="Number of steps for the warmup in the lr scheduler."
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default=None,
+        help="Model type to use if training from scratch.",
+        choices=MODEL_TYPES,
+    )
+    parser.add_argument(
+        "--label_all_tokens",
+        action="store_true",
+        help="If passed, label every sub-token of a word instead of only the first one (the others get -100).",
+    )
+    parser.add_argument(
+        "--return_entity_level_metrics",
+        action="store_true",
+        help="Indication whether entity level metrics are to be returned.",
+    )
+    parser.add_argument(
+        "--task_name",
+        type=str,
+        default="ner",
+        choices=["ner", "pos", "chunk"],
+        help="The name of the task.",
+    )
+    parser.add_argument(
+        "--debug",
+        action="store_true",
+        help="Activate debug mode and run training only with a subset of data.",
+    )
+    args = parser.parse_args()
+
+    # Sanity checks: `task_name` always has a value (default "ner"), so the data source is what must be validated.
+    if args.dataset_name is None and args.train_file is None and args.validation_file is None:
+        raise ValueError("Need either a dataset name or a training/validation file.")
+    if args.train_file is not None:
+        extension = args.train_file.split(".")[-1]
+        assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+    if args.validation_file is not None:
+        extension = args.validation_file.split(".")[-1]
+        assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+
+    if args.output_dir is not None:
+        os.makedirs(args.output_dir, exist_ok=True)
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
+    accelerator = Accelerator()
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.info(accelerator.state)
+
+    # Setup logging, we only want one process per machine to log things on the screen.
+    # accelerator.is_local_main_process is only True for one process per machine.
+    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
+    if accelerator.is_local_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        set_seed(args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
+    # 'tokens' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
+    else:
+        data_files = {}
+        if args.train_file is not None:
+            data_files["train"] = args.train_file
+        if args.validation_file is not None:
+            data_files["validation"] = args.validation_file
+        extension = args.train_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files)
+    # Trim a number of training examples
+    if args.debug:
+        for split in raw_datasets.keys():
+            raw_datasets[split] = raw_datasets[split].select(range(100))
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    if raw_datasets["train"] is not None:
+        column_names = raw_datasets["train"].column_names
+        features = raw_datasets["train"].features
+    else:
+        column_names = raw_datasets["validation"].column_names
+        features = raw_datasets["validation"].features
+    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
+    label_column_name = f"{args.task_name}_tags" if f"{args.task_name}_tags" in column_names else column_names[1]
+
+    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
+    # unique labels.
+    def get_label_list(labels):
+        unique_labels = set()
+        for label in labels:
+            unique_labels = unique_labels | set(label)
+        label_list = list(unique_labels)
+        label_list.sort()
+        return label_list
+
+    if isinstance(features[label_column_name].feature, ClassLabel):
+        label_list = features[label_column_name].feature.names
+        # No need to convert the labels since they are already ints.
+        label_to_id = {i: i for i in range(len(label_list))}
+    else:
+        label_list = get_label_list(raw_datasets["train"][label_column_name])
+        label_to_id = {l: i for i, l in enumerate(label_list)}
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    #
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
+    elif args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=True)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if args.model_name_or_path:
+        model = AutoModelForTokenClassification.from_pretrained(
+            args.model_name_or_path,
+            from_tf=bool(".ckpt" in args.model_name_or_path),
+            config=config,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelForTokenClassification.from_config(config)
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    # Preprocessing the raw_datasets.
+    # First we tokenize all the texts.
+    padding = "max_length" if args.pad_to_max_length else False
+
+    # Tokenize all texts and align the labels with them.
+
+    def tokenize_and_align_labels(examples):
+        tokenized_inputs = tokenizer(
+            examples[text_column_name],
+            max_length=args.max_length,
+            padding=padding,
+            truncation=True,
+            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
+            is_split_into_words=True,
+        )
+
+        labels = []
+        for i, label in enumerate(examples[label_column_name]):
+            word_ids = tokenized_inputs.word_ids(batch_index=i)
+            previous_word_idx = None
+            label_ids = []
+            for word_idx in word_ids:
+                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
+                # ignored in the loss function.
+                if word_idx is None:
+                    label_ids.append(-100)
+                # We set the label for the first token of each word.
+                elif word_idx != previous_word_idx:
+                    label_ids.append(label_to_id[label[word_idx]])
+                # For the other tokens in a word, we set the label to either the current label or -100, depending on
+                # the label_all_tokens flag.
+                else:
+                    label_ids.append(label_to_id[label[word_idx]] if args.label_all_tokens else -100)
+                previous_word_idx = word_idx
+
+            labels.append(label_ids)
+        tokenized_inputs["labels"] = labels
+        return tokenized_inputs
+
+    processed_raw_datasets = raw_datasets.map(
+        tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names
+    )
+
+    train_dataset = processed_raw_datasets["train"]
+    eval_dataset = processed_raw_datasets["validation"]
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # DataLoaders creation:
+    if args.pad_to_max_length:
+        # If padding was already done to max length, we use the default data collator that will just convert everything
+        # to tensors.
+        data_collator = default_data_collator
+    else:
+        # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
+        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
+        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.0 (Volta).
+        data_collator = DataCollatorForTokenClassification(
+            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
+        )
+
+    train_dataloader = DataLoader(
+        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
+    )
+    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)
+
+    # Optimizer
+    # Split weights in two groups, one with weight decay and the other not.
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
+
+    # Use the device given by the `accelerator` object.
+    device = accelerator.device
+    model.to(device)
+
+    # Prepare everything with our `accelerator`.
+    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
+        model, optimizer, train_dataloader, eval_dataloader
+    )
+
+    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
+    # be shorter in multiprocess)
+
+    # Scheduler and math around the number of training steps.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    else:
+        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    lr_scheduler = get_scheduler(
+        name=args.lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=args.num_warmup_steps,
+        num_training_steps=args.max_train_steps,
+    )
+
+    # Metrics
+    metric = load_metric("seqeval")
+
+    def get_labels(predictions, references):
+        # Transform predictions and references tensors to numpy arrays
+        if device.type == "cpu":
+            y_pred = predictions.detach().clone().numpy()
+            y_true = references.detach().clone().numpy()
+        else:
+            y_pred = predictions.detach().cpu().clone().numpy()
+            y_true = references.detach().cpu().clone().numpy()
+
+        # Remove ignored index (special tokens)
+        true_predictions = [
+            [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
+            for pred, gold_label in zip(y_pred, y_true)
+        ]
+        true_labels = [
+            [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
+            for pred, gold_label in zip(y_pred, y_true)
+        ]
+        return true_predictions, true_labels
+
+    def compute_metrics():
+        results = metric.compute()
+        if args.return_entity_level_metrics:
+            # Unpack nested dictionaries
+            final_results = {}
+            for key, value in results.items():
+                if isinstance(value, dict):
+                    for n, v in value.items():
+                        final_results[f"{key}_{n}"] = v
+                else:
+                    final_results[key] = value
+            return final_results
+        else:
+            return {
+                "precision": results["overall_precision"],
+                "recall": results["overall_recall"],
+                "f1": results["overall_f1"],
+                "accuracy": results["overall_accuracy"],
+            }
+
+    # Train!
+    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
+    completed_steps = 0
+
+    for epoch in range(args.num_train_epochs):
+        model.train()
+        for step, batch in enumerate(train_dataloader):
+            outputs = model(**batch)
+            loss = outputs.loss
+            loss = loss / args.gradient_accumulation_steps
+            accelerator.backward(loss)
+            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
+                progress_bar.update(1)
+                completed_steps += 1
+
+            if completed_steps >= args.max_train_steps:
+                break
+
+        model.eval()
+        for step, batch in enumerate(eval_dataloader):
+            with torch.no_grad():
+                outputs = model(**batch)
+            predictions = outputs.logits.argmax(dim=-1)
+            labels = batch["labels"]
+            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
+                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
+                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
+
+            predictions_gathered = accelerator.gather(predictions)
+            labels_gathered = accelerator.gather(labels)
+            preds, refs = get_labels(predictions_gathered, labels_gathered)
+            metric.add_batch(
+                predictions=preds,
+                references=refs,
+            )  # predictions and references are expected to be a nested list of labels, not label_ids
+
+        # eval_metric = metric.compute()
+        eval_metric = compute_metrics()
+        accelerator.print(f"epoch {epoch}:", eval_metric)
+
+    if args.output_dir is not None:
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/token-classification/run_no_trainer.sh
+++ b/examples/token-classification/run_no_trainer.sh
@@ -0,0 +1,21 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+accelerate launch run_ner_no_trainer.py \
+  --model_name_or_path bert-base-uncased \
+  --dataset_name conll2003 \
+  --output_dir /tmp/test-ner \
+  --pad_to_max_length \
+  --task_name ner \
+  --return_entity_level_metrics
--- a/hubconf.py
+++ b/hubconf.py
@@ -22,9 +22,10 @@ sys.path.append(SRC_DIR)
 from transformers import (
    AutoConfig,
    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForMaskedLM,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
-    AutoModelWithLMHead,
    AutoTokenizer,
    add_start_docstrings,
 )
@@ -78,7 +79,7 @@ def model(*args, **kwargs):
            model = torch.hub.load('huggingface/transformers', 'model', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
            assert model.config.output_attentions == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
            model = torch.hub.load('huggingface/transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
@@ -86,22 +87,41 @@ def model(*args, **kwargs):
    return AutoModel.from_pretrained(*args, **kwargs)


-@add_start_docstrings(AutoModelWithLMHead.__doc__)
-def modelWithLMHead(*args, **kwargs):
+@add_start_docstrings(AutoModelForCausalLM.__doc__)
+def modelForCausalLM(*args, **kwargs):
    r"""
        # Using torch.hub !
        import torch

-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2')    # Download model and configuration from huggingface.co and cache.
+        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', 'gpt2', output_attentions=True)  # Update configuration during loading
        assert model.config.output_attentions == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-        model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        config = AutoConfig.from_pretrained('./tf_model/gpt_tf_model_config.json')
+        model = torch.hub.load('huggingface/transformers', 'modelForCausalLM', './tf_model/gpt_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
-    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
+    return AutoModelForCausalLM.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModelForMaskedLM.__doc__)
+def modelForMaskedLM(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+
+            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', 'bert-base-uncased')    # Download model and configuration from huggingface.co and cache.
+            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', './test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/transformers', 'modelForMaskedLM', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+
+    return AutoModelForMaskedLM.from_pretrained(*args, **kwargs)


@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
@@ -115,7 +135,7 @@ def modelForSequenceClassification(*args, **kwargs):
            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
            assert model.config.output_attentions == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
            model = torch.hub.load('huggingface/transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
@@ -134,7 +154,7 @@ def modelForQuestionAnswering(*args, **kwargs):
        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
        assert model.config.output_attentions == True
        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+        config = AutoConfig.from_pretrained('./tf_model/bert_tf_model_config.json')
        model = torch.hub.load('huggingface/transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)

    """
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -35,6 +35,9 @@ Pull Request so it can be included under the Community notebooks.
 | [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)|
 | [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)|
 | [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)|
+| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/multiple_choice.ipynb)|
+| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/master/examples/translation.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/translation.ipynb)|
+| [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/master/examples/summarization.ipynb) | Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/summarization.ipynb)|
 | [How to train a language model from scratch](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)|
 | [How to generate text](https://github.com/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)|
 | [How to export model to ONNX](https://github.com/huggingface/transformers/blob/master/notebooks/04-onnx-export.ipynb) | Highlight how to export and run inference workloads through ONNX |
--- a/setup.py
+++ b/setup.py
@@ -19,15 +19,17 @@ To create the package for pypi.

 1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
   documentation.
+   
+2. Run the tests for Amazon SageMaker. The documentation is located in `./tests/sagemaker/README.md`; if anything is unclear, ask @philschmid.

-2. Unpin specific versions from setup.py that use a git install.
+3. Unpin specific versions from setup.py that use a git install.

-3. Commit these changes with the message: "Release: VERSION"
+4. Commit these changes with the message: "Release: VERSION"

-4. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
+5. Add a tag in git to mark the release: "git tag VERSION -m 'Adds tag VERSION for pypi' "
   Push the tag to git: git push --tags origin master

-5. Build both the sources and the wheel. Do not change anything in setup.py between
+6. Build both the sources and the wheel. Do not change anything in setup.py between
   creating the wheel and the source distribution (obviously).

   For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
@@ -36,7 +38,7 @@ To create the package for pypi.
   For the sources, run: "python setup.py sdist"
   You should now have a /dist directory with both .whl and .tar.gz source versions.

-6. Check that everything looks correct by uploading the package to the pypi test server:
+7. Check that everything looks correct by uploading the package to the pypi test server:

   twine upload dist/* -r pypitest
   (pypi suggest using twine as other methods upload files via plaintext.)
@@ -46,12 +48,12 @@ To create the package for pypi.
   Check that you can install it in a virtualenv by running:
   pip install -i https://testpypi.python.org/pypi transformers

-7. Upload the final version to actual pypi:
+8. Upload the final version to actual pypi:
   twine upload dist/* -r pypi

-8. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.
+9. Copy the release notes from RELEASE.md to the tag in github once everything is looking hunky-dory.

-9. Run `make post-release` (or `make post-patch` for a patch release).
+10. Run `make post-release` (or `make post-patch` for a patch release).
 """

 import os
@@ -87,6 +89,7 @@ _deps = [
    "cookiecutter==1.7.2",
    "dataclasses",
    "datasets",
+    "docutils==0.16.0",
    "faiss-cpu",
    "fastapi",
    "filelock",
@@ -105,6 +108,7 @@ _deps = [
    "onnxruntime>=1.4.0",
    "packaging",
    "parameterized",
+    "Pillow",
    "protobuf",
    "psutil",
    "pydantic",
@@ -122,6 +126,7 @@ _deps = [
    "sphinx-copybutton",
    "sphinx-markdown-tables",
    "sphinx-rtd-theme==0.4.3",  # sphinx-rtd-theme==0.5.0 introduced big changes in the style.
+    "sphinxext-opengraph==0.4.1",
    "sphinx==3.2.1",
    "starlette",
    "tensorflow-cpu>=2.3",
@@ -134,6 +139,7 @@ _deps = [
    "unidic>=1.0.2",
    "unidic_lite>=1.0.7",
    "uvicorn",
+    "sagemaker>=2.31.0",
 ]


@@ -223,19 +229,40 @@ extras["onnxruntime"] = deps_list("onnxruntime", "onnxruntime-tools")
 extras["onnx"] = deps_list("onnxconverter-common", "keras2onnx") + extras["onnxruntime"]
 extras["modelcreation"] = deps_list("cookiecutter")

+extras["sagemaker"] = deps_list("sagemaker")
+
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 extras["speech"] = deps_list("soundfile", "torchaudio")
+extras["vision"] = deps_list("Pillow")

 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["testing"] = (
-    deps_list("pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar")
+    deps_list(
+        "pytest", "pytest-xdist", "timeout-decorator", "parameterized", "psutil", "datasets", "pytest-sugar", "black"
+    )
    + extras["retrieval"]
    + extras["modelcreation"]
 )
-extras["docs"] = deps_list("recommonmark", "sphinx", "sphinx-markdown-tables", "sphinx-rtd-theme", "sphinx-copybutton")
+extras["docs"] = deps_list(
+    "docutils",
+    "recommonmark",
+    "sphinx",
+    "sphinx-markdown-tables",
+    "sphinx-rtd-theme",
+    "sphinx-copybutton",
+    "sphinxext-opengraph",
+)
 extras["quality"] = deps_list("black", "isort", "flake8")

-extras["all"] = extras["tf"] + extras["torch"] + extras["flax"] + extras["sentencepiece"] + extras["tokenizers"]
+extras["all"] = (
+    extras["tf"]
+    + extras["torch"]
+    + extras["flax"]
+    + extras["sentencepiece"]
+    + extras["tokenizers"]
+    + extras["speech"]
+    + extras["vision"]
+)

 extras["dev"] = (
    extras["all"]
@@ -278,7 +305,7 @@ install_requires = [

 setup(
    name="transformers",
-    version="4.4.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="4.5.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
    author_email="thomas@huggingface.co",
    description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -22,7 +22,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.4.2"
+__version__ = "4.5.0"

 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.
@@ -48,6 +48,7 @@ from .file_utils import (
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
+    is_vision_available,
 )
 from .utils import logging

@@ -78,6 +79,7 @@ _import_structure = {
        "xnli_processors",
        "xnli_tasks_num_labels",
    ],
+    "feature_extraction_sequence_utils": ["BatchFeature", "SequenceFeatureExtractor"],
    "file_utils": [
        "CONFIG_NAME",
        "MODEL_CARD_NAME",
@@ -104,6 +106,7 @@ _import_structure = {
        "is_tokenizers_available",
        "is_torch_available",
        "is_torch_tpu_available",
+        "is_vision_available",
    ],
    "hf_argparser": ["HfArgumentParser"],
    "integrations": [
@@ -124,23 +127,8 @@ _import_structure = {
        "load_tf2_model_in_pytorch_model",
        "load_tf2_weights_in_pytorch_model",
    ],
-    "models": [],
    # Models
-    "models.wav2vec2": [
-        "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "Wav2Vec2Config",
-        "Wav2Vec2CTCTokenizer",
-        "Wav2Vec2Tokenizer",
-        "Wav2Vec2FeatureExtractor",
-        "Wav2Vec2Processor",
-    ],
-    "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"],
-    "models.speech_to_text": [
-        "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "Speech2TextConfig",
-        "Speech2TextFeatureExtractor",
-    ],
-    "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
+    "models": [],
    "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
    "models.auto": [
        "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -162,6 +150,7 @@ _import_structure = {
    "models.bert_generation": ["BertGenerationConfig"],
    "models.bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"],
    "models.bertweet": ["BertweetTokenizer"],
+    "models.big_bird": ["BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP", "BigBirdConfig", "BigBirdTokenizer"],
    "models.blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig", "BlenderbotTokenizer"],
    "models.blenderbot_small": [
        "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -169,6 +158,7 @@ _import_structure = {
        "BlenderbotSmallTokenizer",
    ],
    "models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"],
+    "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
    "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
    "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
    "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
@@ -187,12 +177,14 @@ _import_structure = {
    "models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
    "models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
    "models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"],
+    "models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"],
    "models.herbert": ["HerbertTokenizer"],
    "models.ibert": ["IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "IBertConfig"],
    "models.layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig", "LayoutLMTokenizer"],
    "models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
    "models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
    "models.lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig", "LxmertTokenizer"],
+    "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"],
    "models.marian": ["MarianConfig"],
    "models.mbart": ["MBartConfig"],
    "models.mmbt": ["MMBTConfig"],
@@ -207,6 +199,11 @@ _import_structure = {
    "models.reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"],
    "models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"],
    "models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
+    "models.speech_to_text": [
+        "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "Speech2TextConfig",
+        "Speech2TextFeatureExtractor",
+    ],
    "models.squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig", "SqueezeBertTokenizer"],
    "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
    "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"],
@@ -216,6 +213,15 @@ _import_structure = {
        "TransfoXLCorpus",
        "TransfoXLTokenizer",
    ],
+    "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
+    "models.wav2vec2": [
+        "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "Wav2Vec2Config",
+        "Wav2Vec2CTCTokenizer",
+        "Wav2Vec2FeatureExtractor",
+        "Wav2Vec2Processor",
+        "Wav2Vec2Tokenizer",
+    ],
    "models.xlm": ["XLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMConfig", "XLMTokenizer"],
    "models.xlm_prophetnet": ["XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMProphetNetConfig"],
    "models.xlm_roberta": ["XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMRobertaConfig"],
@@ -251,7 +257,6 @@ _import_structure = {
        "SpecialTokensMixin",
        "TokenSpan",
    ],
-    "feature_extraction_sequence_utils": ["SequenceFeatureExtractor", "BatchFeature"],
    "trainer_callback": [
        "DefaultFlowCallback",
        "EarlyStoppingCallback",
@@ -295,7 +300,7 @@ else:
        name for name in dir(dummy_sentencepiece_objects) if not name.startswith("_")
    ]

-# tokenziers-backed objects
+# tokenizers-backed objects
 if is_tokenizers_available():
    # Fast tokenizers
    _import_structure["models.convbert"].append("ConvBertTokenizerFast")
@@ -341,6 +346,17 @@ else:
        name for name in dir(dummy_tokenizers_objects) if not name.startswith("_")
    ]

+# Vision-specific objects
+if is_vision_available():
+    _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"]
+    _import_structure["models.vit"].append("ViTFeatureExtractor")
+else:
+    from .utils import dummy_vision_objects
+
+    _import_structure["utils.dummy_vision_objects"] = [
+        name for name in dir(dummy_vision_objects) if not name.startswith("_")
+    ]
+
 # PyTorch-backed objects
 if is_torch_available():
    _import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"]
@@ -369,7 +385,10 @@ if is_torch_available():
    ]
    _import_structure["generation_beam_search"] = ["BeamScorer", "BeamSearchScorer"]
    _import_structure["generation_logits_process"] = [
+        "ForcedBOSTokenLogitsProcessor",
+        "ForcedEOSTokenLogitsProcessor",
        "HammingDiversityLogitsProcessor",
+        "InfNanRemoveLogitsProcessor",
        "LogitsProcessor",
        "LogitsProcessorList",
        "LogitsWarper",
@@ -383,54 +402,15 @@ if is_torch_available():
        "TopPLogitsWarper",
    ]
    _import_structure["generation_stopping_criteria"] = [
-        "StoppingCriteria",
-        "StoppingCriteriaList",
        "MaxLengthCriteria",
        "MaxTimeCriteria",
+        "StoppingCriteria",
+        "StoppingCriteriaList",
    ]
    _import_structure["generation_utils"] = ["top_k_top_p_filtering"]
    _import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"]
    # PyTorch models structure

-    _import_structure["models.speech_to_text"].extend(
-        [
-            "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "Speech2TextForConditionalGeneration",
-            "Speech2TextModel",
-        ]
-    )
-
-    _import_structure["models.wav2vec2"].extend(
-        [
-            "WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "Wav2Vec2ForCTC",
-            "Wav2Vec2ForMaskedLM",
-            "Wav2Vec2Model",
-            "Wav2Vec2PreTrainedModel",
-        ]
-    )
-    _import_structure["models.m2m_100"].extend(
-        [
-            "M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "M2M100ForConditionalGeneration",
-            "M2M100Model",
-        ]
-    )
-
-    _import_structure["models.convbert"].extend(
-        [
-            "CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "ConvBertForMaskedLM",
-            "ConvBertForMultipleChoice",
-            "ConvBertForQuestionAnswering",
-            "ConvBertForSequenceClassification",
-            "ConvBertForTokenClassification",
-            "ConvBertLayer",
-            "ConvBertModel",
-            "ConvBertPreTrainedModel",
-            "load_tf_weights_in_convbert",
-        ]
-    )
    _import_structure["models.albert"].extend(
        [
            "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -448,6 +428,7 @@ if is_torch_available():
    _import_structure["models.auto"].extend(
        [
            "MODEL_FOR_CAUSAL_LM_MAPPING",
+            "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
            "MODEL_FOR_MASKED_LM_MAPPING",
            "MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
            "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
@@ -509,20 +490,36 @@ if is_torch_available():
            "load_tf_weights_in_bert_generation",
        ]
    )
+    _import_structure["models.big_bird"].extend(
+        [
+            "BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "BigBirdForCausalLM",
+            "BigBirdForMaskedLM",
+            "BigBirdForMultipleChoice",
+            "BigBirdForPreTraining",
+            "BigBirdForQuestionAnswering",
+            "BigBirdForSequenceClassification",
+            "BigBirdForTokenClassification",
+            "BigBirdLayer",
+            "BigBirdModel",
+            "BigBirdPreTrainedModel",
+            "load_tf_weights_in_big_bird",
+        ]
+    )
    _import_structure["models.blenderbot"].extend(
        [
            "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "BlenderbotForCausalLM",
            "BlenderbotForConditionalGeneration",
            "BlenderbotModel",
-            "BlenderbotForCausalLM",
        ]
    )
    _import_structure["models.blenderbot_small"].extend(
        [
            "BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "BlenderbotSmallForCausalLM",
            "BlenderbotSmallForConditionalGeneration",
            "BlenderbotSmallModel",
-            "BlenderbotSmallForCausalLM",
        ]
    )
    _import_structure["models.camembert"].extend(
@@ -537,6 +534,20 @@ if is_torch_available():
            "CamembertModel",
        ]
    )
+    _import_structure["models.convbert"].extend(
+        [
+            "CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "ConvBertForMaskedLM",
+            "ConvBertForMultipleChoice",
+            "ConvBertForQuestionAnswering",
+            "ConvBertForSequenceClassification",
+            "ConvBertForTokenClassification",
+            "ConvBertLayer",
+            "ConvBertModel",
+            "ConvBertPreTrainedModel",
+            "load_tf_weights_in_convbert",
+        ]
+    )
    _import_structure["models.ctrl"].extend(
        [
            "CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -549,23 +560,23 @@ if is_torch_available():
    _import_structure["models.deberta"].extend(
        [
            "DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "DebertaForSequenceClassification",
-            "DebertaModel",
            "DebertaForMaskedLM",
-            "DebertaPreTrainedModel",
-            "DebertaForTokenClassification",
            "DebertaForQuestionAnswering",
+            "DebertaForSequenceClassification",
+            "DebertaForTokenClassification",
+            "DebertaModel",
+            "DebertaPreTrainedModel",
        ]
    )
    _import_structure["models.deberta_v2"].extend(
        [
            "DEBERTA_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "DebertaV2ForSequenceClassification",
-            "DebertaV2Model",
            "DebertaV2ForMaskedLM",
-            "DebertaV2PreTrainedModel",
-            "DebertaV2ForTokenClassification",
            "DebertaV2ForQuestionAnswering",
+            "DebertaV2ForSequenceClassification",
+            "DebertaV2ForTokenClassification",
+            "DebertaV2Model",
+            "DebertaV2PreTrainedModel",
        ]
    )
    _import_structure["models.distilbert"].extend(
@@ -646,6 +657,15 @@ if is_torch_available():
            "load_tf_weights_in_gpt2",
        ]
    )
+    _import_structure["models.gpt_neo"].extend(
+        [
+            "GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "GPTNeoForCausalLM",
+            "GPTNeoModel",
+            "GPTNeoPreTrainedModel",
+            "load_tf_weights_in_gpt_neo",
+        ]
+    )
    _import_structure["models.ibert"].extend(
        [
            "IBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -699,7 +719,14 @@ if is_torch_available():
            "LxmertXLayer",
        ]
    )
-    _import_structure["models.marian"].extend(["MarianModel", "MarianMTModel", "MarianForCausalLM"])
+    _import_structure["models.m2m_100"].extend(
+        [
+            "M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "M2M100ForConditionalGeneration",
+            "M2M100Model",
+        ]
+    )
+    _import_structure["models.marian"].extend(["MarianForCausalLM", "MarianModel", "MarianMTModel"])
    _import_structure["models.mbart"].extend(
        [
            "MBartForCausalLM",
@@ -752,7 +779,7 @@ if is_torch_available():
        ]
    )
    _import_structure["models.pegasus"].extend(
-        ["PegasusForConditionalGeneration", "PegasusModel", "PegasusForCausalLM"]
+        ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel"]
    )
    _import_structure["models.prophetnet"].extend(
        [
@@ -793,6 +820,13 @@ if is_torch_available():
            "RobertaModel",
        ]
    )
+    _import_structure["models.speech_to_text"].extend(
+        [
+            "SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "Speech2TextForConditionalGeneration",
+            "Speech2TextModel",
+        ]
+    )
    _import_structure["models.squeezebert"].extend(
        [
            "SQUEEZEBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -836,6 +870,23 @@ if is_torch_available():
            "load_tf_weights_in_transfo_xl",
        ]
    )
+    _import_structure["models.vit"].extend(
+        [
+            "VIT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "ViTForImageClassification",
+            "ViTModel",
+            "ViTPreTrainedModel",
+        ]
+    )
+    _import_structure["models.wav2vec2"].extend(
+        [
+            "WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "Wav2Vec2ForCTC",
+            "Wav2Vec2ForMaskedLM",
+            "Wav2Vec2Model",
+            "Wav2Vec2PreTrainedModel",
+        ]
+    )
    _import_structure["models.xlm"].extend(
        [
            "XLM_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -916,20 +967,6 @@ if is_tf_available():
        "shape_list",
    ]
    # TensorFlow models structure
-
-    _import_structure["models.convbert"].extend(
-        [
-            "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "TFConvBertForMaskedLM",
-            "TFConvBertForMultipleChoice",
-            "TFConvBertForQuestionAnswering",
-            "TFConvBertForSequenceClassification",
-            "TFConvBertForTokenClassification",
-            "TFConvBertLayer",
-            "TFConvBertModel",
-            "TFConvBertPreTrainedModel",
-        ]
-    )
    _import_structure["models.albert"].extend(
        [
            "TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1002,6 +1039,19 @@ if is_tf_available():
            "TFCamembertModel",
        ]
    )
+    _import_structure["models.convbert"].extend(
+        [
+            "TF_CONVBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFConvBertForMaskedLM",
+            "TFConvBertForMultipleChoice",
+            "TFConvBertForQuestionAnswering",
+            "TFConvBertForSequenceClassification",
+            "TFConvBertForTokenClassification",
+            "TFConvBertLayer",
+            "TFConvBertModel",
+            "TFConvBertPreTrainedModel",
+        ]
+    )
    _import_structure["models.ctrl"].extend(
        [
            "TF_CTRL_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1085,6 +1135,17 @@ if is_tf_available():
            "TFGPT2PreTrainedModel",
        ]
    )
+    _import_structure["models.layoutlm"].extend(
+        [
+            "TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFLayoutLMForMaskedLM",
+            "TFLayoutLMForSequenceClassification",
+            "TFLayoutLMForTokenClassification",
+            "TFLayoutLMMainLayer",
+            "TFLayoutLMModel",
+            "TFLayoutLMPreTrainedModel",
+        ]
+    )
    _import_structure["models.led"].extend(["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"])
    _import_structure["models.longformer"].extend(
        [
@@ -1108,7 +1169,7 @@ if is_tf_available():
            "TFLxmertVisualFeatureEncoder",
        ]
    )
-    _import_structure["models.marian"].extend(["TFMarianMTModel", "TFMarianModel"])
+    _import_structure["models.marian"].extend(["TFMarianModel", "TFMarianMTModel"])
    _import_structure["models.mbart"].extend(["TFMBartForConditionalGeneration", "TFMBartModel"])
    _import_structure["models.mobilebert"].extend(
        [
@@ -1239,8 +1300,39 @@ else:
 # FLAX-backed objects
 if is_flax_available():
    _import_structure["modeling_flax_utils"] = ["FlaxPreTrainedModel"]
-    _import_structure["models.auto"].extend(["FLAX_MODEL_MAPPING", "FlaxAutoModel"])
-    _import_structure["models.bert"].extend(["FlaxBertForMaskedLM", "FlaxBertModel"])
+    _import_structure["models.auto"].extend(
+        [
+            "FLAX_MODEL_FOR_MASKED_LM_MAPPING",
+            "FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING",
+            "FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING",
+            "FLAX_MODEL_FOR_PRETRAINING_MAPPING",
+            "FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
+            "FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
+            "FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
+            "FLAX_MODEL_MAPPING",
+            "FlaxAutoModel",
+            "FlaxAutoModelForMaskedLM",
+            "FlaxAutoModelForMultipleChoice",
+            "FlaxAutoModelForNextSentencePrediction",
+            "FlaxAutoModelForPreTraining",
+            "FlaxAutoModelForQuestionAnswering",
+            "FlaxAutoModelForSequenceClassification",
+            "FlaxAutoModelForTokenClassification",
+        ]
+    )
+    _import_structure["models.bert"].extend(
+        [
+            "FlaxBertForMaskedLM",
+            "FlaxBertForMultipleChoice",
+            "FlaxBertForNextSentencePrediction",
+            "FlaxBertForPreTraining",
+            "FlaxBertForQuestionAnswering",
+            "FlaxBertForSequenceClassification",
+            "FlaxBertForTokenClassification",
+            "FlaxBertModel",
+            "FlaxBertPreTrainedModel",
+        ]
+    )
    _import_structure["models.roberta"].append("FlaxRobertaModel")
 else:
    from .utils import dummy_flax_objects
@@ -1249,7 +1341,6 @@ else:
        name for name in dir(dummy_flax_objects) if not name.startswith("_")
    ]

-
 # Direct imports for type-checking
 if TYPE_CHECKING:
    # Configuration
@@ -1307,6 +1398,7 @@ if TYPE_CHECKING:
        is_tokenizers_available,
        is_torch_available,
        is_torch_tpu_available,
+        is_vision_available,
    )
    from .hf_argparser import HfArgumentParser

@@ -1353,6 +1445,7 @@ if TYPE_CHECKING:
    from .models.bert_generation import BertGenerationConfig
    from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
    from .models.bertweet import BertweetTokenizer
+    from .models.big_bird import BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdConfig, BigBirdTokenizer
    from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer
    from .models.blenderbot_small import (
        BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -1379,6 +1472,7 @@ if TYPE_CHECKING:
    from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
    from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
    from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer
+    from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig
    from .models.herbert import HerbertTokenizer
    from .models.ibert import IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, IBertConfig
    from .models.layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMTokenizer
@@ -1414,6 +1508,7 @@ if TYPE_CHECKING:
        TransfoXLCorpus,
        TransfoXLTokenizer,
    )
+    from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
    from .models.wav2vec2 import (
        WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
        Wav2Vec2Config,
@@ -1476,6 +1571,7 @@ if TYPE_CHECKING:
    from .training_args import TrainingArguments
    from .training_args_seq2seq import Seq2SeqTrainingArguments
    from .training_args_tf import TFTrainingArguments
+    from .utils import logging

    if is_sentencepiece_available():
        from .models.albert import AlbertTokenizer
@@ -1534,6 +1630,12 @@ if TYPE_CHECKING:
    else:
        from .utils.dummy_tokenizers_objects import *

+    if is_vision_available():
+        from .image_utils import ImageFeatureExtractionMixin
+        from .models.vit import ViTFeatureExtractor
+    else:
+        from .utils.dummy_vision_objects import *
+
    # Modeling
    if is_torch_available():

@@ -1564,7 +1666,10 @@ if TYPE_CHECKING:
        )
        from .generation_beam_search import BeamScorer, BeamSearchScorer
        from .generation_logits_process import (
+            ForcedBOSTokenLogitsProcessor,
+            ForcedEOSTokenLogitsProcessor,
            HammingDiversityLogitsProcessor,
+            InfNanRemoveLogitsProcessor,
            LogitsProcessor,
            LogitsProcessorList,
            LogitsWarper,
@@ -1577,6 +1682,12 @@ if TYPE_CHECKING:
            TopKLogitsWarper,
            TopPLogitsWarper,
        )
+        from .generation_stopping_criteria import (
+            MaxLengthCriteria,
+            MaxTimeCriteria,
+            StoppingCriteria,
+            StoppingCriteriaList,
+        )
        from .generation_utils import top_k_top_p_filtering
        from .modeling_utils import Conv1D, PreTrainedModel, apply_chunking_to_forward, prune_layer
        from .models.albert import (
@@ -1593,6 +1704,7 @@ if TYPE_CHECKING:
        )
        from .models.auto import (
            MODEL_FOR_CAUSAL_LM_MAPPING,
+            MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
            MODEL_FOR_MASKED_LM_MAPPING,
            MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
            MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
@@ -1647,6 +1759,20 @@ if TYPE_CHECKING:
            BertGenerationEncoder,
            load_tf_weights_in_bert_generation,
        )
+        from .models.big_bird import (
+            BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST,
+            BigBirdForCausalLM,
+            BigBirdForMaskedLM,
+            BigBirdForMultipleChoice,
+            BigBirdForPreTraining,
+            BigBirdForQuestionAnswering,
+            BigBirdForSequenceClassification,
+            BigBirdForTokenClassification,
+            BigBirdLayer,
+            BigBirdModel,
+            BigBirdPreTrainedModel,
+            load_tf_weights_in_big_bird,
+        )
        from .models.blenderbot import (
            BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,
            BlenderbotForCausalLM,
@@ -1772,6 +1898,13 @@ if TYPE_CHECKING:
            GPT2PreTrainedModel,
            load_tf_weights_in_gpt2,
        )
+        from .models.gpt_neo import (
+            GPT_NEO_PRETRAINED_MODEL_ARCHIVE_LIST,
+            GPTNeoForCausalLM,
+            GPTNeoModel,
+            GPTNeoPreTrainedModel,
+            load_tf_weights_in_gpt_neo,
+        )
        from .models.ibert import (
            IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            IBertForMaskedLM,
@@ -1780,6 +1913,7 @@ if TYPE_CHECKING:
            IBertForSequenceClassification,
            IBertForTokenClassification,
            IBertModel,
+            IBertPreTrainedModel,
        )
        from .models.layoutlm import (
            LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -1931,6 +2065,12 @@ if TYPE_CHECKING:
            TransfoXLPreTrainedModel,
            load_tf_weights_in_transfo_xl,
        )
+        from .models.vit import (
+            VIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            ViTForImageClassification,
+            ViTModel,
+            ViTPreTrainedModel,
+        )
        from .models.wav2vec2 import (
            WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
            Wav2Vec2ForCTC,
@@ -2008,6 +2148,15 @@ if TYPE_CHECKING:
        # Benchmarks
        from .benchmark.benchmark_tf import TensorFlowBenchmark
        from .generation_tf_utils import tf_top_k_top_p_filtering
+        from .models.layoutlm import (
+            TF_LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFLayoutLMForMaskedLM,
+            TFLayoutLMForSequenceClassification,
+            TFLayoutLMForTokenClassification,
+            TFLayoutLMMainLayer,
+            TFLayoutLMModel,
+            TFLayoutLMPreTrainedModel,
+        )
        from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, shape_list
        from .models.albert import (
            TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -2170,7 +2319,7 @@ if TYPE_CHECKING:
            TFLxmertPreTrainedModel,
            TFLxmertVisualFeatureEncoder,
        )
-        from .models.marian import TFMarian, TFMarianMTModel
+        from .models.marian import TFMarianModel, TFMarianMTModel
        from .models.mbart import TFMBartForConditionalGeneration, TFMBartModel
        from .models.mobilebert import (
            TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -2280,13 +2429,41 @@ if TYPE_CHECKING:

    if is_flax_available():
        from .modeling_flax_utils import FlaxPreTrainedModel
-        from .models.auto import FLAX_MODEL_MAPPING, FlaxAutoModel
-        from .models.bert import FlaxBertForMaskedLM, FlaxBertModel
+        from .models.auto import (
+            FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+            FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+            FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+            FLAX_MODEL_FOR_PRETRAINING_MAPPING,
+            FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+            FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+            FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            FLAX_MODEL_MAPPING,
+            FlaxAutoModel,
+            FlaxAutoModelForMaskedLM,
+            FlaxAutoModelForMultipleChoice,
+            FlaxAutoModelForNextSentencePrediction,
+            FlaxAutoModelForPreTraining,
+            FlaxAutoModelForQuestionAnswering,
+            FlaxAutoModelForSequenceClassification,
+            FlaxAutoModelForTokenClassification,
+        )
+        from .models.bert import (
+            FlaxBertForMaskedLM,
+            FlaxBertForMultipleChoice,
+            FlaxBertForNextSentencePrediction,
+            FlaxBertForPreTraining,
+            FlaxBertForQuestionAnswering,
+            FlaxBertForSequenceClassification,
+            FlaxBertForTokenClassification,
+            FlaxBertModel,
+            FlaxBertPreTrainedModel,
+        )
        from .models.roberta import FlaxRobertaModel
    else:
        # Import the same objects as dummies to get them in the namespace.
        # They will raise an import error if the user tries to instantiate / use them.
        from .utils.dummy_flax_objects import *
+
 else:
    import importlib
    import os
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -95,4 +95,4 @@ def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
-        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
+        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
--- a/src/transformers/activations_tf.py
+++ b/src/transformers/activations_tf.py
@@ -91,4 +91,4 @@ def get_tf_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
-        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
+        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
--- a/src/transformers/benchmark/benchmark.py
+++ b/src/transformers/benchmark/benchmark.py
@@ -218,7 +218,7 @@ class PyTorchBenchmark(Benchmark):

            return min(runtimes) / 10.0
        except RuntimeError as e:
-            self.print_fn("Doesn't fit on GPU. {}".format(e))
+            self.print_fn(f"Doesn't fit on GPU. {e}")
            return "N/A"

    def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
@@ -263,5 +263,5 @@ class PyTorchBenchmark(Benchmark):

            return memory, summary
        except RuntimeError as e:
-            self.print_fn("Doesn't fit on GPU. {}".format(e))
+            self.print_fn(f"Doesn't fit on GPU. {e}")
            return "N/A", None
--- a/src/transformers/benchmark/benchmark_tf.py
+++ b/src/transformers/benchmark/benchmark_tf.py
@@ -227,7 +227,7 @@ class TensorFlowBenchmark(Benchmark):

                return min(runtimes) / 10.0
            except ResourceExhaustedError as e:
-                self.print_fn("Doesn't fit on GPU. {}".format(e))
+                self.print_fn(f"Doesn't fit on GPU. {e}")

    def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
        logger.info(
@@ -290,5 +290,5 @@ class TensorFlowBenchmark(Benchmark):

                return memory, summary
            except ResourceExhaustedError as e:
-                self.print_fn("Doesn't fit on GPU. {}".format(e))
+                self.print_fn(f"Doesn't fit on GPU. {e}")
                return "N/A", None
--- a/src/transformers/benchmark/benchmark_utils.py
+++ b/src/transformers/benchmark/benchmark_utils.py
@@ -758,9 +758,7 @@ class Benchmark(ABC):

        if self.args.env_print:
            self.print_fn("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=")
-            self.print_fn(
-                "\n".join(["- {}: {}".format(prop, val) for prop, val in self.environment_info.items()]) + "\n"
-            )
+            self.print_fn("\n".join([f"- {prop}: {val}" for prop, val in self.environment_info.items()]) + "\n")

        if self.args.save_to_csv:
            with open(self.args.env_info_csv_file, mode="w", newline="") as csv_file:
@@ -888,9 +886,7 @@ class Benchmark(ABC):
        self.print_fn("Saving results to csv.")
        with open(filename, mode="w") as csv_file:

-            assert len(self.args.model_names) > 0, "At least 1 model should be defined, but got {}".format(
-                self.model_names
-            )
+            assert len(self.args.model_names) > 0, f"At least 1 model should be defined, but got {self.model_names}"

            fieldnames = ["model", "batch_size", "sequence_length"]
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["result"])
--- a/src/transformers/commands/convert.py
+++ b/src/transformers/commands/convert.py
@@ -76,7 +76,7 @@ class ConvertCommand(BaseTransformersCLICommand):
    ):
        self._logger = logging.get_logger("transformers-cli/converting")

-        self._logger.info("Loading model {}".format(model_type))
+        self._logger.info(f"Loading model {model_type}")
        self._model_type = model_type
        self._tf_checkpoint = tf_checkpoint
        self._pytorch_dump_output = pytorch_dump_output
--- a/src/transformers/commands/env.py
+++ b/src/transformers/commands/env.py
@@ -56,8 +56,8 @@ class EnvironmentCommand(BaseTransformersCLICommand):
            "`transformers` version": version,
            "Platform": platform.platform(),
            "Python version": platform.python_version(),
-            "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available),
-            "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available),
+            "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
+            "Tensorflow version (GPU?)": f"{tf_version} ({tf_cuda_available})",
            "Using GPU in script?": "<fill in>",
            "Using distributed or parallel set-up in script?": "<fill in>",
        }
@@ -69,4 +69,4 @@ class EnvironmentCommand(BaseTransformersCLICommand):

    @staticmethod
    def format_dict(d):
-        return "\n".join(["- {}: {}".format(prop, val) for prop, val in d.items()]) + "\n"
+        return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
--- a/src/transformers/commands/run.py
+++ b/src/transformers/commands/run.py
@@ -31,8 +31,8 @@ def try_infer_format_from_ext(path: str):
            return ext

    raise Exception(
-        "Unable to determine file format from file extension {}. "
-        "Please provide the format through --format {}".format(path, PipelineDataFormat.SUPPORTED_FORMATS)
+        f"Unable to determine file format from file extension {path}. "
+        f"Please provide the format through --format {PipelineDataFormat.SUPPORTED_FORMATS}"
    )


@@ -105,6 +105,6 @@ class RunCommand(BaseTransformersCLICommand):
        # Saving data
        if self._nlp.binary_output:
            binary_path = self._reader.save_binary(outputs)
-            logger.warning("Current pipeline requires output to be in binary format, saving at {}".format(binary_path))
+            logger.warning(f"Current pipeline requires output to be in binary format, saving at {binary_path}")
        else:
            self._reader.save(outputs)
--- a/src/transformers/commands/serving.py
+++ b/src/transformers/commands/serving.py
@@ -133,7 +133,7 @@ class ServeCommand(BaseTransformersCLICommand):
                "Or install FastAPI and unicorn separately."
            )
        else:
-            logger.info("Serving model over {}:{}".format(host, port))
+            logger.info(f"Serving model over {host}:{port}")
            self._app = FastAPI(
                routes=[
                    APIRoute(
--- a/src/transformers/commands/train.py
+++ b/src/transformers/commands/train.py
@@ -104,7 +104,7 @@ class TrainCommand(BaseTransformersCLICommand):
        self.column_text = args.column_text
        self.column_id = args.column_id

-        self.logger.info("Loading {} pipeline for {}".format(args.task, args.model))
+        self.logger.info(f"Loading {args.task} pipeline for {args.model}")
        if args.task == "text_classification":
            self.pipeline = TextClassificationPipeline.from_pretrained(args.model)
        elif args.task == "token_classification":
@@ -112,7 +112,7 @@ class TrainCommand(BaseTransformersCLICommand):
        elif args.task == "question_answering":
            raise NotImplementedError

-        self.logger.info("Loading dataset from {}".format(args.train_data))
+        self.logger.info(f"Loading dataset from {args.train_data}")
        self.train_dataset = Processor.create_from_csv(
            args.train_data,
            column_label=args.column_label,
@@ -122,7 +122,7 @@ class TrainCommand(BaseTransformersCLICommand):
        )
        self.valid_dataset = None
        if args.validation_data:
-            self.logger.info("Loading validation dataset from {}".format(args.validation_data))
+            self.logger.info(f"Loading validation dataset from {args.validation_data}")
            self.valid_dataset = Processor.create_from_csv(
                args.validation_data,
                column_label=args.column_label,
--- a/src/transformers/commands/user.py
+++ b/src/transformers/commands/user.py
@@ -99,15 +99,15 @@ class ANSI:

    @classmethod
    def bold(cls, s):
-        return "{}{}{}".format(cls._bold, s, cls._reset)
+        return f"{cls._bold}{s}{cls._reset}"

    @classmethod
    def red(cls, s):
-        return "{}{}{}".format(cls._bold + cls._red, s, cls._reset)
+        return f"{cls._bold}{cls._red}{s}{cls._reset}"

    @classmethod
    def gray(cls, s):
-        return "{}{}{}".format(cls._gray, s, cls._reset)
+        return f"{cls._gray}{s}{cls._reset}"


 def tabulate(rows: List[List[Union[str, int]]], headers: List[str]) -> str:
@@ -268,8 +268,8 @@ class RepoCreateCommand(BaseUserCommand):

        user, _ = self._api.whoami(token)
        namespace = self.args.organization if self.args.organization is not None else user
-
-        print("You are about to create {}".format(ANSI.bold(namespace + "/" + self.args.name)))
+        full_name = f"{namespace}/{self.args.name}"
+        print(f"You are about to create {ANSI.bold(full_name)}")

        if not self.args.yes:
            choice = input("Proceed? [Y/n] ").lower()
@@ -283,7 +283,7 @@ class RepoCreateCommand(BaseUserCommand):
            print(ANSI.red(e.response.text))
            exit(1)
        print("\nYour repo now lives at:")
-        print("  {}".format(ANSI.bold(url)))
+        print(f"  {ANSI.bold(url)}")
        print("\nYou can clone it locally with the command below," " and commit/push as usual.")
        print(f"\n  git clone {url}")
        print("")
@@ -328,16 +328,15 @@ class UploadCommand(BaseUserCommand):
            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
            files = [(local_path, filename)]
        else:
-            raise ValueError("Not a valid file or directory: {}".format(local_path))
+            raise ValueError(f"Not a valid file or directory: {local_path}")

        if sys.platform == "win32":
            files = [(filepath, filename.replace(os.sep, "/")) for filepath, filename in files]

        if len(files) > UPLOAD_MAX_FILES:
            print(
-                "About to upload {} files to S3. This is probably wrong. Please filter files before uploading.".format(
-                    ANSI.bold(len(files))
-                )
+                f"About to upload {ANSI.bold(len(files))} files to S3. This is probably wrong. Please filter files "
+                "before uploading."
            )
            exit(1)

@@ -346,9 +345,8 @@ class UploadCommand(BaseUserCommand):

        for filepath, filename in files:
            print(
-                "About to upload file {} to S3 under filename {} and namespace {}".format(
-                    ANSI.bold(filepath), ANSI.bold(filename), ANSI.bold(namespace)
-                )
+                f"About to upload file {ANSI.bold(filepath)} to S3 under filename {ANSI.bold(filename)} and namespace "
+                f"{ANSI.bold(namespace)}"
            )

        if not self.args.yes:
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -34,19 +34,30 @@ class PretrainedConfig(object):
    Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as
    methods for loading/downloading/saving configurations.

-    Note: A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
-    initialize a model does **not** load the model weights. It only affects the model's configuration.
+    Note:
+        A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to
+        initialize a model does **not** load the model weights. It only affects the model's configuration.

    Class attributes (overridden by derived classes)

-        - **model_type** (:obj:`str`): An identifier for the model type, serialized into the JSON file, and used to
+        - **model_type** (:obj:`str`) -- An identifier for the model type, serialized into the JSON file, and used to
          recreate the correct object in :class:`~transformers.AutoConfig`.
-        - **is_composition** (:obj:`bool`): Whether the config class is composed of multiple sub-configs. In this case
-          the config has to be initialized from two or more configs of type :class:`~transformers.PretrainedConfig`
-          like: :class:`~transformers.EncoderDecoderConfig` or :class:`~RagConfig`.
-        - **keys_to_ignore_at_inference** (:obj:`List[str]`): A list of keys to ignore by default when looking at
+        - **is_composition** (:obj:`bool`) -- Whether the config class is composed of multiple sub-configs. In this
+          case the config has to be initialized from two or more configs of type
+          :class:`~transformers.PretrainedConfig` like: :class:`~transformers.EncoderDecoderConfig` or
+          :class:`~transformers.RagConfig`.
+        - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at
          dictionary outputs of the model during inference.

+    Common attributes (present in all subclasses)
+
+        - **vocab_size** (:obj:`int`) -- The number of tokens in the vocabulary, which is also the first dimension of
+          the embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT).
+        - **hidden_size** (:obj:`int`) -- The hidden size of the model.
+        - **num_attention_heads** (:obj:`int`) -- The number of attention heads used in the multi-head attention layers
+          of the model.
+        - **num_hidden_layers** (:obj:`int`) -- The number of blocks in the model.
+
    Args:
        name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`):
            Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or
@@ -134,6 +145,9 @@ class PretrainedConfig(object):
          <../model_doc/mbart>` where the first generated token needs to be the target language token.
        - **forced_eos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the last generated token
          when :obj:`max_length` is reached.
+        - **remove_invalid_values** (:obj:`bool`, `optional`) -- Whether to remove possible `nan` and `inf` outputs of
+          the model to prevent the generation method from crashing. Note that using ``remove_invalid_values`` can slow
+          down generation.


    Parameters for fine-tuning tasks
@@ -219,6 +233,7 @@ class PretrainedConfig(object):
        self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
        self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
        self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
+        self.remove_invalid_values = kwargs.pop("remove_invalid_values", False)

        # Fine-tuning task arguments
        self.architectures = kwargs.pop("architectures", None)
@@ -263,7 +278,7 @@ class PretrainedConfig(object):
            try:
                setattr(self, key, value)
            except AttributeError as err:
-                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                logger.error(f"Can't set {key} with value {value} for {self}")
                raise err

    @property
@@ -292,7 +307,7 @@ class PretrainedConfig(object):
    @num_labels.setter
    def num_labels(self, num_labels: int):
        if self.id2label is None or len(self.id2label) != num_labels:
-            self.id2label = {i: "LABEL_{}".format(i) for i in range(num_labels)}
+            self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
            self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))

    def save_pretrained(self, save_directory: Union[str, os.PathLike]):
@@ -305,7 +320,7 @@ class PretrainedConfig(object):
                Directory where the configuration JSON file will be saved (will be created if it does not exist).
        """
        if os.path.isfile(save_directory):
-            raise AssertionError("Provided path ({}) should be a directory, not a file".format(save_directory))
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
        os.makedirs(save_directory, exist_ok=True)
        # If we save using the predefined names, we can load using `from_pretrained`
        output_config_file = os.path.join(save_directory, CONFIG_NAME)
@@ -384,6 +399,11 @@ class PretrainedConfig(object):

        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if config_dict.get("model_type", False) and hasattr(cls, "model_type"):
+            assert (
+                config_dict["model_type"] == cls.model_type
+            ), f"You tried to initiate a model of type '{cls.model_type}' with a pretrained model of type '{config_dict['model_type']}'"
+
        return cls.from_dict(config_dict, **kwargs)

    @classmethod
@@ -411,6 +431,12 @@ class PretrainedConfig(object):
        use_auth_token = kwargs.pop("use_auth_token", None)
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
+        from_pipeline = kwargs.pop("_from_pipeline", None)
+        from_auto_class = kwargs.pop("_from_auto", False)
+
+        user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
+        if from_pipeline is not None:
+            user_agent["using_pipeline"] = from_pipeline

        if is_offline_mode() and not local_files_only:
            logger.info("Offline mode: forcing local_files_only=True")
@@ -436,6 +462,7 @@ class PretrainedConfig(object):
                resume_download=resume_download,
                local_files_only=local_files_only,
                use_auth_token=use_auth_token,
+                user_agent=user_agent,
            )
            # Load config dict
            config_dict = cls._dict_from_json_file(resolved_config_file)
@@ -451,16 +478,16 @@ class PretrainedConfig(object):

        except json.JSONDecodeError:
            msg = (
-                "Couldn't reach server at '{}' to download configuration file or "
+                f"Couldn't reach server at '{config_file}' to download configuration file or "
                "configuration file is not a valid JSON file. "
-                "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+                f"Please check network or file content here: {resolved_config_file}."
            )
            raise EnvironmentError(msg)

        if resolved_config_file == config_file:
-            logger.info("loading configuration file {}".format(config_file))
+            logger.info(f"loading configuration file {config_file}")
        else:
-            logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file))
+            logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}")

        return config_dict, kwargs

@@ -496,7 +523,7 @@ class PretrainedConfig(object):
        for key in to_remove:
            kwargs.pop(key, None)

-        logger.info("Model config %s", str(config))
+        logger.info(f"Model config {config}")
        if return_unused_kwargs:
            return config, kwargs
        else:
@@ -528,7 +555,7 @@ class PretrainedConfig(object):
        return self.__dict__ == other.__dict__

    def __repr__(self):
-        return "{} {}".format(self.__class__.__name__, self.to_json_string())
+        return f"{self.__class__.__name__} {self.to_json_string()}"

    def to_diff_dict(self) -> Dict[str, Any]:
        """
--- a/src/transformers/convert_graph_to_onnx.py
+++ b/src/transformers/convert_graph_to_onnx.py
@@ -19,9 +19,9 @@ from typing import Dict, List, Optional, Tuple

 from packaging.version import Version, parse

-from .file_utils import ModelOutput, is_tf_available, is_torch_available
-from .pipelines import Pipeline, pipeline
-from .tokenization_utils import BatchEncoding
+from transformers.file_utils import ModelOutput, is_tf_available, is_torch_available
+from transformers.pipelines import Pipeline, pipeline
+from transformers.tokenization_utils import BatchEncoding


 # This is the minimal required version to
@@ -154,7 +154,7 @@ def ensure_valid_input(model, tokens, input_names):
            print(f"{arg_name} is not present in the generated input list.")
            break

-    print("Generated inputs order: {}".format(ordered_input_names))
+    print(f"Generated inputs order: {ordered_input_names}")
    return ordered_input_names, tuple(model_args)


--- a/Show More
+++ b/Show More