Release v4.11.2

Fix gather for TPU (#13813 )
Release v4.11.1
2021-09-30 11:54:39 -04:00 · 2021-09-30 11:53:13 -04:00 · 2021-09-29 12:04:25 -04:00 · 2021-09-29 12:03:56 -04:00 · 2021-09-29 12:03:51 -04:00 · 2021-09-29 12:03:46 -04:00
619 changed files with 69047 additions and 11578 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,7 +80,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
@@ -97,6 +97,37 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_torch_and_tf_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PT_TF_CROSS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                key: v0.4-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf tests -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_torch_and_flax:
        working_directory: ~/transformers
@@ -116,7 +147,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
@@ -133,6 +164,37 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_torch_and_flax_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PT_FLAX_CROSS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                key: v0.4-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax tests -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_torch:
        working_directory: ~/transformers
@@ -151,7 +213,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision,timm]
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
@@ -168,6 +230,36 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_torch_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_tf:
        working_directory: ~/transformers
@@ -185,7 +277,7 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
@@ -201,6 +293,34 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_tf_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_flax:
        working_directory: ~/transformers
@@ -218,7 +338,7 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,testing,sentencepiece]
+            - run: sudo pip install .[flax,testing,sentencepiece,flax-speech,vision]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
@@ -234,6 +354,34 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_flax_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                keys:
+                    - v0.4-flax-{{ checksum "setup.py" }}
+                    - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: sudo pip install .[flax,testing,sentencepiece,vision,flax-speech]
+            - save_cache:
+                  key: v0.4-flax-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_pipelines_torch:
        working_directory: ~/transformers
@@ -253,7 +401,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
@@ -270,6 +418,37 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_pipelines_torch_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_pipelines_tf:
        working_directory: ~/transformers
@@ -305,6 +484,35 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/reports

+    run_tests_pipelines_tf_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf tests -m is_pipeline_test | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
@@ -349,24 +557,55 @@ jobs:
                  keys:
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,sentencepiece,testing]
+            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
            - run: pip install -r examples/pytorch/_tests_requirements.txt
            - save_cache:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
            - store_artifacts:
                  path: ~/transformers/test_preparation.txt
            - run: |
                  if [ -f test_list.txt ]; then
-                    TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+                    python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee tests_output.txt
                  fi
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_examples_torch_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_examples-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
+            - run: pip install -r examples/pytorch/_tests_requirements.txt
+            - save_cache:
+                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_hub:
        working_directory: ~/transformers
@@ -399,8 +638,45 @@ jobs:
                  path: ~/transformers/test_preparation.txt
            - run: |
                  if [ -f test_list.txt ]; then
-                    python -m pytest -sv $(cat test_list.txt) -m is_staging_test
+                    python -m pytest -sv --make-reports=tests_hub $(cat test_list.txt) -m is_staging_test | tee tests_output.txt
                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_hub_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            HUGGINGFACE_CO_STAGING: yes
+            RUN_GIT_LFS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-hub-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get install git-lfs
+            - run: |
+                git config --global user.email "ci@dummy.com"
+                git config --global user.name "ci"
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,sentencepiece,testing]
+            - save_cache:
+                  key: v0.4-hub-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -sv --make-reports=tests_hub tests -m is_staging_test | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_onnxruntime:
        working_directory: ~/transformers
@@ -428,12 +704,41 @@ jobs:
                  path: ~/transformers/test_preparation.txt
            - run: |
                  if [ -f test_list.txt ]; then
-                    python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) -k onnx | tee tests_output.txt
+                    python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx $(cat test_list.txt) -k onnx | tee tests_output.txt
                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_onnxruntime_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,sentencepiece,onnxruntime]
+            - save_cache:
+                  key: v0.4-onnx-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx tests -k onnx | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    build_doc:
        working_directory: ~/transformers
        docker:
@@ -524,6 +829,44 @@ jobs:
            - run: pip install requests
            - run: python ./utils/link_tester.py

+    run_tests_layoutlmv2:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,vision]
+            - run: pip install torchvision
+            - run: python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+            - run: sudo apt install tesseract-ocr
+            - run: pip install pytesseract
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 1 tests/*layoutlmv2* --dist=loadfile -s --make-reports=tests_layoutlmv2 --durations=100
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
 # TPU JOBS
    run_examples_tpu:
        docker:
@@ -578,7 +921,28 @@ workflows:
            - run_tests_onnxruntime
            - run_tests_hub
            - build_doc
+            - run_tests_layoutlmv2
            - deploy_doc: *workflow_filters
+    nightly:
+        triggers:
+            - schedule:
+                cron: "0 0 * * *"
+                filters:
+                    branches:
+                        only:
+                            - master
+        jobs:
+            - run_examples_torch_all
+            - run_tests_torch_and_tf_all
+            - run_tests_torch_and_flax_all
+            - run_tests_torch_all
+            - run_tests_tf_all
+            - run_tests_flax_all
+            - run_tests_pipelines_torch_all
+            - run_tests_pipelines_tf_all
+            - run_tests_onnxruntime_all
+            - run_tests_hub_all
+
 #    tpu_testing_jobs:
 #        triggers:
 #            - schedule:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -67,4 +67,9 @@ deploy_doc "25dee4a" v4.6.0
 deploy_doc "7a6c9fa" v4.7.0
 deploy_doc "9252a51" v4.8.0
 deploy_doc "1366172" v4.8.1
-deploy_doc "96d1cfb"  # v4.8.2 Latest stable release
+deploy_doc "96d1cfb" v4.8.2
+deploy_doc "72aee83" v4.9.0
+deploy_doc "bff1c71" v4.9.1
+deploy_doc "41981a2" v4.9.2
+deploy_doc "39cb6f5" v4.10.0
+deploy_doc "28e2787"  # v4.10.1 Latest stable release
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@@ -0,0 +1,42 @@
+name: Doctests
+
+on:
+  push:
+    branches:
+      - doctest*
+  repository_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  RUN_SLOW: yes
+  OMP_NUM_THREADS: 16
+  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600
+
+jobs:
+  run_doctests:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libsndfile1-dev
+          pip install --upgrade pip
+          pip install .[dev]
+
+      - name: Run doctests
+        run: |
+          pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -47,6 +47,8 @@ jobs:
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          make style
          python utils/check_table.py --fix_and_overwrite
          python utils/check_dummies.py --fix_and_overwrite
@@ -59,7 +61,7 @@ jobs:
      - name: Run style changes
        run: |
          git fetch origin master:master
-          make fixup
+          make style && make quality

      - name: Failure short reports
        if: ${{ always() }}
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -0,0 +1,257 @@
+name: Self-hosted runner; Nightly (scheduled)
+
+on:
+    push:
+        branches:
+            - nightly_ci*
+    repository_dispatch:
+    schedule:
+        - cron: "0 0 */3 * *"
+
+env:
+    HF_HOME: /mnt/cache
+    TRANSFORMERS_IS_CI: yes
+    RUN_SLOW: yes
+    OMP_NUM_THREADS: 16
+    MKL_NUM_THREADS: 16
+    PYTEST_TIMEOUT: 600
+    SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+
+jobs:
+    run_all_tests_torch_gpu:
+        runs-on: [self-hosted, docker-gpu, single-gpu]
+        container:
+            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libsndfile1-dev git
+                  pip install --upgrade pip
+                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_gpu_failures_short.txt
+
+            - name: Run examples tests on GPU
+              if: ${{ always() }}
+              env:
+                  OMP_NUM_THREADS: 16
+                  MKL_NUM_THREADS: 16
+                  RUN_SLOW: yes
+                  HF_HOME: /mnt/cache
+                  TRANSFORMERS_IS_CI: yes
+              run: |
+                  pip install -r examples/pytorch/_tests_requirements.txt
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/examples_torch_gpu_failures_short.txt
+
+            - name: Run all pipeline tests on GPU
+              if: ${{ always() }}
+              env:
+                  RUN_PIPELINE_TESTS: yes
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_all_tests_torch_gpu_test_reports
+                  path: reports
+
+    run_all_tests_torch_multi_gpu:
+        runs-on: [self-hosted, docker-gpu, multi-gpu]
+        container:
+            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              continue-on-error: true
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libsndfile1-dev git
+                  pip install --upgrade pip
+                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              env:
+                  MKL_SERVICE_FORCE_INTEL: 1
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+            - name: Run all pipeline tests on GPU
+              if: ${{ always() }}
+              env:
+                  RUN_PIPELINE_TESTS: yes
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_all_tests_torch_multi_gpu_test_reports
+                  path: reports
+
+    run_all_tests_torch_cuda_extensions_gpu:
+        runs-on: [self-hosted, docker-gpu, single-gpu]
+        container:
+            image: nvcr.io/nvidia/pytorch:21.03-py3
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libaio-dev
+                  pip install --upgrade pip
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+                  pip install .[testing,deepspeed]
+                  pip install git+https://github.com/microsoft/DeepSpeed
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_tests_torch_cuda_extensions_gpu_test_reports
+                  path: reports
+
+    run_all_tests_torch_cuda_extensions_multi_gpu:
+        runs-on: [self-hosted, docker-gpu, multi-gpu]
+        container:
+            image: nvcr.io/nvidia/pytorch:21.03-py3
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              continue-on-error: true
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libaio-dev
+                  pip install --upgrade pip
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+                  pip install .[testing,deepspeed,fairscale]
+                  pip install git+https://github.com/microsoft/DeepSpeed
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+ 
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+                  path: reports
+
+    send_results:
+        name: Send results to webhook
+        runs-on: ubuntu-latest
+        if: always()
+        needs: [
+                run_all_tests_torch_gpu,
+                run_all_tests_torch_multi_gpu,
+                run_all_tests_torch_cuda_extensions_gpu,
+                run_all_tests_torch_cuda_extensions_multi_gpu
+        ]
+        steps:
+            - uses: actions/checkout@v2
+
+            - uses: actions/download-artifact@v2
+
+            - name: Send message to Slack
+              env:
+                  CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+                  CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+                  CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+                  CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+
+              run: |
+                  pip install slack_sdk
+                  python utils/notification_service.py scheduled nightly-torch
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -11,6 +11,7 @@ on:
      - "tests/**"
      - ".github/**"
      - "templates/**"
+      - "utils/**"
  repository_dispatch:

 env:
@@ -27,32 +28,47 @@ jobs:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          apt install -y libsndfile1-dev
+          pip install --upgrade pip
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libsndfile1-dev
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
-
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all non-slow tests on GPU
        run: |
-          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -62,6 +78,61 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

+  run_tests_flax_gpu:
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+          pip install --upgrade pip
+          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
+
+      - name: Launcher docker
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
+      - name: Run all non-slow tests on GPU
+        run: |
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
+          fi
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: cat reports/tests_flax_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_flax_gpu_test_reports
+          path: reports
+
 #  run_tests_tf_gpu:
 #    runs-on: [self-hosted, docker-gpu, single-gpu]
 #    timeout-minutes: 120
@@ -69,32 +140,47 @@ jobs:
 #      image: tensorflow/tensorflow:2.4.1-gpu
 #      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 #    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
+#
 #      - name: Launcher docker
 #        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
 #
 #      - name: NVIDIA-SMI
 #        run: |
 #          nvidia-smi
 #
-#      - name: Install dependencies
-#        run: |
-#          pip install --upgrade pip
-#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-#
 #      - name: Are GPUs recognized by our DL frameworks
 #        run: |
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 #
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
 #      - name: Run all non-slow tests on GPU
 #        env:
 #          TF_NUM_INTRAOP_THREADS: 8
 #          TF_NUM_INTEROP_THREADS: 1
 #        run: |
-#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt)
+#          fi
 #
 #      - name: Failure short reports
-#        if: ${{ always() }}
+#        if: ${{ failure() }}
 #        run: cat reports/tests_tf_gpu_failures_short.txt
 #
 #      - name: Test suite reports artifacts
@@ -111,18 +197,22 @@ jobs:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          apt install -y libsndfile1-dev
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+      
+      - name: Launcher docker
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -131,14 +221,26 @@ jobs:
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
      - name: Run all non-slow tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -148,6 +250,61 @@ jobs:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

+#  run_tests_flax_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
+#
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
+#
+#      - name: NVIDIA-SMI
+#        continue-on-error: true
+#        run: |
+#          nvidia-smi
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+#      
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
+#      - name: Run all non-slow tests on GPU
+#        run: |
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
+#          fi
+#
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_flax_multi_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_flax_multi_gpu_test_reports
+#          path: reports
+
 #  run_tests_tf_multi_gpu:
 #    runs-on: [self-hosted, docker-gpu, multi-gpu]
 #    timeout-minutes: 120
@@ -155,32 +312,47 @@ jobs:
 #      image: tensorflow/tensorflow:2.4.1-gpu
 #      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 #    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
+#
 #      - name: Launcher docker
 #        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
 #
 #      - name: NVIDIA-SMI
 #        run: |
 #          nvidia-smi
 #
-#      - name: Install dependencies
-#        run: |
-#          pip install --upgrade pip
-#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-#
 #      - name: Are GPUs recognized by our DL frameworks
 #        run: |
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 #
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
 #      - name: Run all non-slow tests on GPU
 #        env:
 #          TF_NUM_INTRAOP_THREADS: 8
 #          TF_NUM_INTEROP_THREADS: 1
 #        run: |
-#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt)
+#          fi
 #
 #      - name: Failure short reports
-#        if: ${{ always() }}
+#        if: ${{ failure() }}
 #        run: cat reports/tests_tf_multi_gpu_failures_short.txt
 #
 #      - name: Test suite reports artifacts
@@ -198,6 +370,8 @@ jobs:
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
@@ -215,13 +389,25 @@ jobs:
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
+      
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -239,8 +425,11 @@ jobs:
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

@@ -257,12 +446,24 @@ jobs:
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -15,6 +15,7 @@ env:
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
  PYTEST_TIMEOUT: 600
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}

 jobs:
  run_all_tests_torch_gpu:
@@ -32,9 +33,9 @@ jobs:

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -85,6 +86,46 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

+  run_all_tests_flax_gpu:
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_flax_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_flax_gpu_test_reports
+          path: reports
+
  run_all_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
@@ -100,8 +141,9 @@ jobs:

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -149,14 +191,15 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -203,13 +246,15 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -247,6 +292,45 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+#  run_all_tests_flax_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Install dependencies
+#        run: |
+#          pip install --upgrade pip
+#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+#
+#      - name: Run all tests on GPU
+#        run: |
+#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
+#
+#      - name: Failure short reports
+#        if: ${{ always() }}
+#        run: cat reports/tests_flax_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_flax_gpu_test_reports
+#          path: reports
+
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
@@ -298,6 +382,7 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

@@ -350,6 +435,7 @@ jobs:
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}


        run: |
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,82 @@
+cff-version: "1.2.0"
+date-released: 2020-10
+message: "If you use this software, please cite it using these metadata."
+title: "Transformers: State-of-the-Art Natural Language Processing"
+url: "https://github.com/huggingface/transformers"
+authors: 
+  - family-names: Wolf
+    given-names: Thomas
+  - family-names: Debut
+    given-names: Lysandre
+  - family-names: Sanh
+    given-names: Victor
+  - family-names: Chaumond
+    given-names: Julien
+  - family-names: Delangue
+    given-names: Clement
+  - family-names: Moi
+    given-names: Anthony
+  - family-names: Cistac
+    given-names: Perric
+  - family-names: Ma
+    given-names: Clara
+  - family-names: Jernite
+    given-names: Yacine
+  - family-names: Plu
+    given-names: Julien
+  - family-names: Xu
+    given-names: Canwen
+  - family-names: "Le Scao"
+    given-names: Teven
+  - family-names: Gugger
+    given-names: Sylvain
+  - family-names: Drame
+    given-names: Mariama
+  - family-names: Lhoest
+    given-names: Quentin
+  - family-names: Rush
+    given-names: "Alexander M."
+preferred-citation:
+  type: inproceedings
+  authors:
+  - family-names: Wolf
+    given-names: Thomas
+  - family-names: Debut
+    given-names: Lysandre
+  - family-names: Sanh
+    given-names: Victor
+  - family-names: Chaumond
+    given-names: Julien
+  - family-names: Delangue
+    given-names: Clement
+  - family-names: Moi
+    given-names: Anthony
+  - family-names: Cistac
+    given-names: Perric
+  - family-names: Ma
+    given-names: Clara
+  - family-names: Jernite
+    given-names: Yacine
+  - family-names: Plu
+    given-names: Julien
+  - family-names: Xu
+    given-names: Canwen
+  - family-names: "Le Scao"
+    given-names: Teven
+  - family-names: Gugger
+    given-names: Sylvain
+  - family-names: Drame
+    given-names: Mariama
+  - family-names: Lhoest
+    given-names: Quentin
+  - family-names: Rush
+    given-names: "Alexander M."
+  booktitle: "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations"
+  month: 10
+  start: 38
+  end: 45
+  title: "Transformers: State-of-the-Art Natural Language Processing"
+  year: 2020
+  publisher: "Association for Computational Linguistics"
+  url: "https://www.aclweb.org/anthology/2020.emnlp-demos.6"
+  address: "Online"
--- a/1
+++ b/1
@@ -30,7 +30,6 @@ deps_table_check_updated:
 # autogenerating code

 autogenerate_code: deps_table_update
-	python utils/class_mapping_update.py

 # Check that source code meets quality standards

--- a/README.md
+++ b/README.md
@@ -211,6 +211,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
@@ -234,20 +235,25 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
 for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
 Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -255,14 +261,19 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)**
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -235,10 +235,11 @@ conda install -c huggingface transformers
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
-1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
@@ -255,18 +256,21 @@ conda install -c huggingface transformers
 1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) 和德语版 DistilBERT。
-1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval
-for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon
-Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
+1. **[FNet](https://huggingface.co/transformers/master/model_doc/fnet.html)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
-1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
@@ -279,14 +283,19 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/master/model_doc/speechencoderdecoder.html)** 
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/master/model_doc/speech_to_text_2.html)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -247,10 +247,11 @@ conda install -c huggingface transformers
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
@@ -267,23 +268,26 @@ conda install -c huggingface transformers
 1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
-for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
-Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/master/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -291,14 +295,19 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/master/model_doc/speechencoderdecoder.html)** 
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/master/model_doc/speech_to_text_2.html)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,12 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.8.2"
+const stableVersion = "v4.10.1"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.8.0/v4.8.1/v4.8.2 (stable)",
+    "": "v4.10.0/v4.10.1 (stable)",
+    "v4.9.2": "v4.9.0/v4.9.1/v4.9.2",
+    "v4.8.2": "v4.8.0/v4.8.1/v4.8.2",
    "v4.7.0": "v4.7.0",
    "v4.6.0": "v4.6.0",
    "v4.5.1": "v4.5.0/v4.5.1",
--- a/docs/source/add_new_pipeline.rst
+++ b/docs/source/add_new_pipeline.rst
@@ -0,0 +1,143 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+
+How to add a pipeline to 🤗 Transformers?
+=======================================================================================================================
+
+First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
+dictionnaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as
+possible as it makes compatibility easier (even through other languages via JSON). Those will be the :obj:`inputs` of
+the pipeline (:obj:`preprocess`).
+
+Then define the :obj:`outputs`. Same policy as the :obj:`inputs`. The simpler, the better. Those will be the outputs of
+:obj:`postprocess` method.
+
+Start by inheriting the base class :obj:`Pipeline`. with the 4 methods needed to implement :obj:`preprocess`,
+:obj:`_forward`, :obj:`postprocess` and :obj:`_sanitize_parameters`.
+
+
+.. code-block::
+
+    from transformers import Pipeline
+
+    class MyPipeline(Pipeline):
+        def _sanitize_parameters(self, **kwargs)
+            preprocess_kwargs = {}
+            if "maybe_arg" in kwargs:
+                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+            return preprocess_kwargs, {}, {}
+
+        def preprocess(self, inputs, maybe_arg=2)
+            model_input = Tensor(....)
+            return {"model_input": model_input}
+
+        def _forward(self, model_inputs)
+            # model_inputs == {"model_input": model_input}
+            oututs = self.model(**model_inputs)
+            # Maybe {"logits": Tensor(...)}
+            return outputs
+
+        def postprocess(self, model_outputs)
+            best_class = model_outputs["logits"].softmax(-1)
+            return best_class
+
+
+The structure of this breakdown is to support relatively seemless support for CPU/GPU, while supporting doing
+pre/postprocessing on the CPU on different threads
+
+:obj:`preprocess` will take the original defined inputs, and turn them something feedable to the model. It might
+contain more information and is usally a :obj:`Dict`.
+
+:obj:`_forward` is the implementation detail and is not meant to be called directly :obj:`forward` is the preferred
+called method as it contains safeguards to make sure everything is working on the expected device. If anything is
+linked to a real model it belongs in the :obj:`_forward` method, anything else is in the preprocess/postrocess.
+
+:obj:`postprocess` methods will take the output of :obj:`_forward` and turn it into the final output that were decided
+earlier.
+
+:obj:`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
+time ``pipeline(...., maybe_arg=4)`` or at call time ``pipe = pipeline(...); output = pipe(...., maybe_arg=4)``.
+
+The returns of :obj:`_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to :obj:`preprocess`,
+:obj:`_forward` and :obj:`postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
+allows to keep the default arguments in the function definition which is always more "natural".
+
+A classic example would be a :obj:`top_k` argument in the post processing in classification tasks.
+
+.. code-block::
+
+    >>> pipe = pipeline("my-new-task")
+    >>> pipe("This is a test")
+    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
+    {"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+
+    >>> pipe("This is a test", top_k=2)
+    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+
+In order to achieve that, we'll update our :obj:`postprocess` method with a default parameter to :obj:`5`. and edit
+:obj:`_sanitize_parameters` to allow this new parameter.
+
+
+.. code-block::
+
+
+        def postprocess(self, model_outputs, top_k=5)
+            best_class = model_outputs["logits"].softmax(-1)
+            # Add logic to handle top_k
+            return best_class
+
+        def _sanitize_parameters(self, **kwargs)
+            preprocess_kwargs = {}
+            if "maybe_arg" in kwargs:
+                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+
+            postprocess_kwargs = {}
+            if "top_k" in kwargs:
+                preprocess_kwargs["top_k"] = kwargs["top_k"]
+            return preprocess_kwargs, {}, postprocess_kwargs
+
+Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
+without requiring users to understand new kind of objects. It's also relatively common to support many different types
+of arguments for ease of use (audio files, can be filenames, URLs or pure bytes)
+
+
+
+Adding it to the list of supported tasks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Go to ``src/transformers/pipelines/__init__.py`` and fill in :obj:`SUPPORTED_TASKS` with your newly created pipeline.
+If possible it should provide a default model.
+
+Adding tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Create a new file ``tests/test_pipelines_MY_PIPELINE.py`` with example with the other tests.
+
+The :obj:`run_pipeline_test` function will be very generic and run on small random models on every possible
+architecture as defined by :obj:`model_mapping` and :obj:`tf_model_mapping`.
+
+This is very important to test future compatibilty, meaning if someone adds a new model for
+:obj:`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
+impossible to check for actual values, that's why There is a helper :obj:`ANY` that will simply attempt to match the
+output of the pipeline TYPE.
+
+You also *need* to implement 2 (ideally 4) tests.
+
+- :obj:`test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_tf`.
+- :obj:`test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_pt`.
+- :obj:`test_large_model_pt` (:obj:`optional`): Tests the pipeline on a real pipeline where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases
+- :obj:`test_large_model_tf` (:obj:`optional`): Tests the pipeline on a real pipeline where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -1,4 +1,4 @@
-# Community
+# Community

 This page regroups resources around 🤗 Transformers developed by the community.

@@ -12,6 +12,7 @@ This page regroups resources around 🤗 Transformers developed by the community

 | Notebook     |      Description      |      Author      |      |
 |:----------|:-------------|:-------------|------:|
+| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model |  [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
 | [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
 | [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb)  | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
 | [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb)  | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning |  [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -27,7 +27,12 @@ author = "huggingface"
 # The short X.Y version
 version = ""
 # The full version, including alpha/beta/rc tags
-release = u'4.7.0'
+release = "4.11.2"
+
+
+
+
+



@@ -208,6 +213,9 @@ epub_title = project
 # A list of files that should not be packed into the epub file.
 epub_exclude_files = ["search.html"]

+# Localization
+locale_dirs = ['locale/']
+gettext_compact = False

 def setup(app):
    app.add_css_file("css/huggingface.css")
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -105,187 +105,219 @@ Supported models
 3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
   French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
   Tixier, Michalis Vazirgiannis.
-4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+4. :doc:`BEiT <model_doc/beit>` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers
+   <https://arxiv.org/abs/2106.08254>`__ by Hangbo Bao, Li Dong, Furu Wei.
+5. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
   Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
   Kenton Lee and Kristina Toutanova.
-5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
+6. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
+7. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
   for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
   Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-7. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
+8. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
   Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava
   Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-8. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+9. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-9. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
-   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
-   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-10. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+10. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building
+    an open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju,
+    Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+11. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
    <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
-11. :doc:`ByT5 <model_doc/byt5>` (from Google Research) released with the paper `ByT5: Towards a token-free future with
+12. :doc:`ByT5 <model_doc/byt5>` (from Google Research) released with the paper `ByT5: Towards a token-free future with
    pre-trained byte-to-byte models <https://arxiv.org/abs/2105.13626>`__ by Linting Xue, Aditya Barua, Noah Constant,
    Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-12. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+13. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-13. :doc:`CANINE <model_doc/canine>` (from Google Research) released with the paper `CANINE: Pre-training an Efficient
+14. :doc:`CANINE <model_doc/canine>` (from Google Research) released with the paper `CANINE: Pre-training an Efficient
    Tokenization-Free Encoder for Language Representation <https://arxiv.org/abs/2103.06874>`__ by Jonathan H. Clark,
    Dan Garrette, Iulia Turc, John Wieting.
-14. :doc:`CLIP <model_doc/clip>` (from OpenAI) released with the paper `Learning Transferable Visual Models From
+15. :doc:`CLIP <model_doc/clip>` (from OpenAI) released with the paper `Learning Transferable Visual Models From
    Natural Language Supervision <https://arxiv.org/abs/2103.00020>`__ by Alec Radford, Jong Wook Kim, Chris Hallacy,
    Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen
    Krueger, Ilya Sutskever.
-15. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+16. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-16. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
+17. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-17. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+18. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-18. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+19. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-19. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+20. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-20. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
+21. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-21. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
+22. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
    <https://arxiv.org/abs/2005.12872>`__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
    Alexander Kirillov, Sergey Zagoruyko.
-22. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+23. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-23. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+24. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-24. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+25. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-25. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+26. :doc:`EncoderDecoder <model_doc/encoderdecoder>` (from Google Research) released with the paper `Leveraging
+    Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
+    Narayan, Aliaksei Severyn.
+27. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-26. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+28. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-27. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+29. :doc:`FNet <model_doc/fnet>` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier
+    Transforms <https://arxiv.org/abs/2105.03824>`__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago
+    Ontanon.
+30. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-28. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+31. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-29. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+32. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-30. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+33. :doc:`GPT-J <model_doc/gptj>` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax
+    <https://github.com/kingoflolz/mesh-transformer-jax/>`__ by Ben Wang and Aran Komatsuzaki.
+34. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-31. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
+35. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
    Representation Learning by Masked Prediction of Hidden Units <https://arxiv.org/abs/2106.07447>`__ by Wei-Ning Hsu,
    Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-32. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
-    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-33. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+36. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
+37. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-34. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+38. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
+    Multi-modal Pre-training for Visually-Rich Document Understanding <https://arxiv.org/abs/2012.14740>`__ by Yang Xu,
+    Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min
+    Zhang, Lidong Zhou.
+39. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
+    Multimodal Pre-training for Multilingual Visually-rich Document Understanding <https://arxiv.org/abs/2104.08836>`__
+    by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
+40. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-35. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+41. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-36. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
+42. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
    Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
    Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-37. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+43. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-38. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
-    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
-    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
-    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-39. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+44. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma,
+    Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal,
+    Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+45. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-40. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+46. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-41. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+47. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-42. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+48. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-43. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+49. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-44. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+50. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-45. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+51. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-46. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
-    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
+52. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-47. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+53. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-48. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+54. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-49. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+55. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
+    pre-trained language models <https://arxiv.org/pdf/2010.12821.pdf>`__ by Hyung Won Chung, Thibault Févry, Henry
+    Tsai, M. Johnson, Sebastian Ruder.
+56. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-50. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
+57. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
    Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
    Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-51. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+58. :doc:`SpeechEncoderDecoder <model_doc/speechencoderdecoder>`
+59. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-52. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
-    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
-    Krishna, and Kurt W. Keutzer.
-53. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+60. :doc:`SpeechToTextTransformer2 <model_doc/speech_to_text_2>` (from Facebook), released together with the paper
+    `Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
+    Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+61. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
+    Question Answering by Pretraining Span Selection <https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain,
+    Jonathan Berant, Amir Globerson, Omer Levy.
+62. :doc:`SqueezeBert <model_doc/squeezebert>` (from Berkeley) released with the paper `SqueezeBERT: What can computer
+    vision teach NLP about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola,
+    Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+63. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-54. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+64. :doc:`T5v1.1 <model_doc/t5v1.1>` (from Google AI) released in the repository
+    `google-research/text-to-text-transfer-transformer
+    <https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__ by
+    Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi
+    Zhou and Wei Li and Peter J. Liu.
+65. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-55. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+66. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-56. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+67. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-57. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
+68. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
    Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
    Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-58. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+69. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-59. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+70. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-60. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+71. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-61. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+72. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-62. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+73. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-63. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+74. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -305,10 +337,12 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            Model            | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
 +=============================+================+================+=================+====================+==============+
-|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            BART             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            BERT             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
@@ -319,71 +353,81 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|       BlenderbotSmall       |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Canine            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            DeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 | FairSeq Machine-Translation |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          FlauBERT           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            FNet             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            GPT-J            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         LayoutLMv2          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            LUKE             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           M2M100            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -391,14 +435,22 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           RemBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          RoFormer           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|   Speech Encoder decoder    |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Speech2Text         |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        Speech2Text2         |       ✅       |       ❌       |       ❌        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Splinter           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
@@ -407,10 +459,10 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         VisualBert          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -421,10 +473,6 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
    :maxdepth: 2
@@ -462,6 +510,7 @@ Flax), PyTorch, and/or TensorFlow.
    migration
    contributing
    add_new_model
+    add_new_pipeline
    fast_tokenizers
    performance
    parallelism
@@ -503,6 +552,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/auto
    model_doc/bart
    model_doc/barthez
+    model_doc/beit
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
@@ -529,11 +579,14 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/electra
    model_doc/encoderdecoder
    model_doc/flaubert
+    model_doc/fnet
    model_doc/fsmt
    model_doc/funnel
    model_doc/herbert
    model_doc/ibert
    model_doc/layoutlm
+    model_doc/layoutlmv2
+    model_doc/layoutxlm
    model_doc/led
    model_doc/longformer
    model_doc/luke
@@ -548,6 +601,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
+    model_doc/gptj
    model_doc/gpt_neo
    model_doc/hubert
    model_doc/pegasus
@@ -555,12 +609,17 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/prophetnet
    model_doc/rag
    model_doc/reformer
+    model_doc/rembert
    model_doc/retribert
    model_doc/roberta
    model_doc/roformer
+    model_doc/speechencoderdecoder
    model_doc/speech_to_text
+    model_doc/speech_to_text_2
+    model_doc/splinter
    model_doc/squeezebert
    model_doc/t5
+    model_doc/t5v1.1
    model_doc/tapas
    model_doc/transformerxl
    model_doc/vit
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -63,7 +63,6 @@ TensorFlow custom layers
    :members: call

 .. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
-    :members: call


 TensorFlow loss functions
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -17,6 +17,11 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
 either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
 from HuggingFace's AWS S3 repository).

+Each derived config class implements model specific attributes. Common attributes present in all config classes are:
+:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
+:obj:`vocab_size`.
+
+

 PretrainedConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/main_classes/data_collator.rst
+++ b/docs/source/main_classes/data_collator.rst
@@ -18,7 +18,7 @@ the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.

 To be able to build batches, data collators may apply some processing (like padding). Some of them (like
 :class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
-oin the formed batch.
+on the formed batch.

 Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.

@@ -54,18 +54,18 @@ DataCollatorForLanguageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


 DataCollatorForWholeWordMask
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


 DataCollatorForPermutationLanguageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
--- a/docs/source/main_classes/deepspeed.rst
+++ b/docs/source/main_classes/deepspeed.rst
@@ -1728,7 +1728,7 @@ For example for a pretrained model:
 .. code-block:: python

    from transformers.deepspeed import HfDeepSpeedConfig
-    from transformers import AugoModel
+    from transformers import AutoModel, deepspeed

    ds_config = { ... } # deepspeed config object or path to the file
    # must run before instantiating the model
@@ -1741,7 +1741,7 @@ or for non-pretrained model:
 .. code-block:: python

    from transformers.deepspeed import HfDeepSpeedConfig
-    from transformers import AugoModel, AutoConfig
+    from transformers import AutoModel, AutoConfig, deepspeed

    ds_config = { ... } # deepspeed config object or path to the file
    # must run before instantiating the model
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -299,3 +299,93 @@ TFSeq2SeqQuestionAnsweringModelOutput

 .. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
    :members:
+
+
+FlaxBaseModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutput
+
+
+FlaxBaseModelOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPast
+
+
+FlaxBaseModelOutputWithPooling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPooling
+
+
+FlaxBaseModelOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
+
+
+FlaxSeq2SeqModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqModelOutput
+
+
+FlaxCausalLMOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
+
+
+FlaxMaskedLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxMaskedLMOutput
+
+
+FlaxSeq2SeqLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqLMOutput
+
+
+FlaxNextSentencePredictorOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxNextSentencePredictorOutput
+
+
+FlaxSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSequenceClassifierOutput
+
+
+FlaxSeq2SeqSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
+
+
+FlaxMultipleChoiceModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxMultipleChoiceModelOutput
+
+
+FlaxTokenClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxTokenClassifierOutput
+
+
+FlaxQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
+
+
+FlaxSeq2SeqQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -23,20 +23,22 @@ There are two categories of pipeline abstractions to be aware about:
 - The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
 - The other task-specific pipelines:

+    - :class:`~transformers.AudioClassificationPipeline`
    - :class:`~transformers.AutomaticSpeechRecognitionPipeline`
    - :class:`~transformers.ConversationalPipeline`
    - :class:`~transformers.FeatureExtractionPipeline`
    - :class:`~transformers.FillMaskPipeline`
    - :class:`~transformers.ImageClassificationPipeline`
+    - :class:`~transformers.ObjectDetectionPipeline`
    - :class:`~transformers.QuestionAnsweringPipeline`
    - :class:`~transformers.SummarizationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`
    - :class:`~transformers.TextClassificationPipeline`
    - :class:`~transformers.TextGenerationPipeline`
+    - :class:`~transformers.Text2TextGenerationPipeline`
    - :class:`~transformers.TokenClassificationPipeline`
    - :class:`~transformers.TranslationPipeline`
    - :class:`~transformers.ZeroShotClassificationPipeline`
-    - :class:`~transformers.Text2TextGenerationPipeline`
-    - :class:`~transformers.TableQuestionAnsweringPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -44,12 +46,60 @@ The pipeline abstraction
 The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
 pipeline but requires an additional argument which is the `task`.

+Simple call on one item:
+
+.. code-block::
+
+    >>> pipe = pipeline("text-classification")
+    >>> pipe("This restaurant is awesome")
+    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+
+To call a pipeline on many items, you can either call with a `list`.
+
+.. code-block::
+
+    >>> pipe = pipeline("text-classification")
+    >>> pipe(["This restaurant is awesome", "This restaurant is aweful"])
+    [{'label': 'POSITIVE', 'score': 0.9998743534088135},
+     {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
+
+
+To iterate of full datasets it is recommended to use a :obj:`dataset` directly. This means you don't need to allocate
+the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom loops on
+GPU. If it doesn't don't hesitate to create an issue.
+
+.. code-block::
+
+    pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
+    dataset = datasets.load_dataset("superb", name="asr", split="test")
+
+    # KeyDataset (only `pt`) will simply return the item in the dict returned by the dataset item
+    # as we're not interested in the `target` part of the dataset.
+    for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
+        print(out)
+        # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+        # {"text": ....}
+        # ....
+
+
 .. autofunction:: transformers.pipeline

+Implementing a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:doc:`Implementing a new pipeline <../add_new_pipeline>`

 The task specific pipelines
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+
+AudioClassificationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.AudioClassificationPipeline
+    :special-members: __call__
+    :members:
+
 AutomaticSpeechRecognitionPipeline
 =======================================================================================================================

@@ -94,6 +144,13 @@ NerPipeline

 See :class:`~transformers.TokenClassificationPipeline` for all details.

+ObjectDetectionPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.ObjectDetectionPipeline
+    :special-members: __call__
+    :members:
+
 QuestionAnsweringPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -64,9 +64,9 @@ classification:

    class MultilabelTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
-            labels = inputs.pop("labels")
+            labels = inputs.get("labels")
            outputs = model(**inputs)
-            logits = outputs.logits
+            logits = outputs.get('logits')
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.float().view(-1, self.model.config.num_labels))
@@ -119,6 +119,29 @@ TFTrainingArguments
    :members:


+Checkpoints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, :class:`~transformers.Trainer` will save all checkpoints in the :obj:`output_dir` you set in the
+:class:`~transformers.TrainingArguments` you are using. Those will go in subfolder named :obj:`checkpoint-xxx` with xxx
+being the step at which the training was at.
+
+Resuming training from a checkpoint can be done when calling :meth:`~transformers.Trainer.train` with either:
+
+- :obj:`resume_from_checkpoint=True` which will resume training from the latest checkpoint
+- :obj:`resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory
+  passed.
+
+In addition, you can easily save your checkpoints on the Model Hub when using :obj:`push_to_hub=True`. By default, all
+the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt
+the :obj:`hub-strategy` value of your :class:`~transformers.TrainingArguments` to either:
+
+- :obj:`"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to
+  resume training easily with :obj:`trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`.
+- :obj:`"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one
+  checkpoint folder per folder in your final repository)
+
+
 Logging
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -197,7 +220,7 @@ which should make the "stop and resume" style of training as close as possible t
 However, due to various default non-deterministic pytorch settings this might not fully work. If you want full
 determinism please refer to `Controlling sources of randomness
 <https://pytorch.org/docs/stable/notes/randomness.html>`__. As explained in the document, that some of those settings
-that make things determinstic (.e.g., ``torch.backends.cudnn.deterministic``) may slow things down, therefore this
+that make things deterministic (.e.g., ``torch.backends.cudnn.deterministic``) may slow things down, therefore this
 can't be done by default, but you can enable those yourself if needed.


--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -43,7 +43,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

-This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The original code can be found `here
+This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. This model jax version was contributed by
+`kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/google-research/ALBERT>`__.

 AlbertConfig
@@ -174,3 +175,52 @@ TFAlbertForQuestionAnswering

 .. autoclass:: transformers.TFAlbertForQuestionAnswering
    :members: call
+
+
+FlaxAlbertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertModel
+    :members: __call__
+
+
+FlaxAlbertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForPreTraining
+    :members: __call__
+
+
+FlaxAlbertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForMaskedLM
+    :members: __call__
+
+
+FlaxAlbertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForSequenceClassification
+    :members: __call__
+
+
+FlaxAlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForMultipleChoice
+    :members: __call__
+
+
+FlaxAlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForTokenClassification
+    :members: __call__
+
+
+FlaxAlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -135,6 +135,34 @@ AutoModelForImageClassification
    :members:


+AutoModelForAudioClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForAudioClassification
+    :members:
+
+
+AutoModelForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForCTC
+    :members:
+
+
+AutoModelForSpeechSeq2Seq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSpeechSeq2Seq
+    :members:
+
+
+AutoModelForObjectDetection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForObjectDetection
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/beit.rst
+++ b/docs/source/model_doc/beit.rst
@@ -0,0 +1,119 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BEiT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BEiT model was proposed in `BEiT: BERT Pre-Training of Image Transformers <https://arxiv.org/abs/2106.08254>`__ by
+Hangbo Bao, Li Dong and Furu Wei. Inspired by BERT, BEiT is the first paper that makes self-supervised pre-training of
+Vision Transformers (ViTs) outperform supervised pre-training. Rather than pre-training the model to predict the class
+of an image (as done in the `original ViT paper <https://arxiv.org/abs/2010.11929>`__), BEiT models are pre-trained to
+predict visual tokens from the codebook of OpenAI's `DALL-E model <https://arxiv.org/abs/2102.12092>`__ given masked
+patches.
+
+The abstract from the paper is the following:
+
+*We introduce a self-supervised vision representation model BEiT, which stands for Bidirectional Encoder representation
+from Image Transformers. Following BERT developed in the natural language processing area, we propose a masked image
+modeling task to pretrain vision Transformers. Specifically, each image has two views in our pre-training, i.e, image
+patches (such as 16x16 pixels), and visual tokens (i.e., discrete tokens). We first "tokenize" the original image into
+visual tokens. Then we randomly mask some image patches and fed them into the backbone Transformer. The pre-training
+objective is to recover the original visual tokens based on the corrupted image patches. After pre-training BEiT, we
+directly fine-tune the model parameters on downstream tasks by appending task layers upon the pretrained encoder.
+Experimental results on image classification and semantic segmentation show that our model achieves competitive results
+with previous pre-training methods. For example, base-size BEiT achieves 83.2% top-1 accuracy on ImageNet-1K,
+significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains
+86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%).*
+
+Tips:
+
+- BEiT models are regular Vision Transformers, but pre-trained in a self-supervised way rather than supervised. They
+  outperform both the original model (ViT) as well as Data-efficient Image Transformers (DeiT) when fine-tuned on
+  ImageNet-1K and CIFAR-100.
+- As the BEiT models expect each image to be of the same size (resolution), one can use
+  :class:`~transformers.BeitFeatureExtractor` to resize (or rescale) and normalize images for the model.
+- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
+  each checkpoint. For example, :obj:`microsoft/beit-base-patch16-224` refers to a base-sized architecture with patch
+  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
+  <https://huggingface.co/models?search=microsoft/beit>`__.
+- The available checkpoints are either (1) pre-trained on `ImageNet-22k <http://www.image-net.org/>`__ (a collection of
+  14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on `ImageNet-1k
+  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
+  images and 1,000 classes).
+- BEiT uses relative position embeddings, inspired by the T5 model. During pre-training, the authors shared the
+  relative position bias among the several self-attention layers. During fine-tuning, each layer's relative position
+  bias is initialized with the shared relative position bias obtained after pre-training. Note that, if one wants to
+  pre-train a model from scratch, one needs to either set the :obj:`use_relative_position_bias` or the
+  :obj:`use_relative_position_bias` attribute of :class:`~transformers.BeitConfig` to :obj:`True` in order to add
+  position embeddings.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The JAX/FLAX version of this model was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
+<https://github.com/microsoft/unilm/tree/master/beit>`__.
+
+BeitConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitConfig
+    :members:
+
+
+BeitFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitFeatureExtractor
+    :members: __call__
+
+
+BeitModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitModel
+    :members: forward
+
+
+BeitForMaskedImageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitForMaskedImageModeling
+    :members: forward
+
+
+BeitForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitForImageClassification
+    :members: forward
+
+
+FlaxBeitModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitModel
+    :members: __call__
+
+
+FlaxBeitForMaskedImageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitForMaskedImageModeling
+    :members: __call__
+
+
+FlaxBeitForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitForImageClassification
+    :members: __call__
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -76,6 +76,9 @@ Bert specific outputs
 .. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
    :members:

+.. autoclass:: transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+    :members:
+

 BertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/blenderbot_small.rst
+++ b/docs/source/model_doc/blenderbot_small.rst
@@ -57,6 +57,13 @@ BlenderbotSmallTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+BlenderbotSmallTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallTokenizerFast
+    :members:
+
+
 BlenderbotSmallModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/byt5.rst
+++ b/docs/source/model_doc/byt5.rst
@@ -39,8 +39,11 @@ experiments.*
 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
 found `here <https://github.com/google-research/byt5>`__.

+ByT5's architecture is based on the T5v1.1 model, so one can refer to :doc:`T5v1.1's documentation page <t5v1.1>`. They
+only differ in how inputs should be prepared for the model, see the code examples below.

-ByT5's architecture is based on the T5 model, so one can refer to :doc:`T5's documentation page <t5>`.
+Since ByT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.


 Example
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -38,7 +38,8 @@ the training data performs consistently better on a wide range of NLP tasks, ach
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*


-This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. This model TF 2.0 implementation was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__ . The original code can be found `here
 <https://github.com/microsoft/DeBERTa>`__.


@@ -103,3 +104,45 @@ DebertaForQuestionAnswering

 .. autoclass:: transformers.DebertaForQuestionAnswering
    :members: forward
+
+
+TFDebertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaModel
+    :members: call
+
+
+TFDebertaPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaPreTrainedModel
+    :members: call
+
+
+TFDebertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForMaskedLM
+    :members: call
+
+
+TFDebertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForSequenceClassification
+    :members: call
+
+
+TFDebertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForTokenClassification
+    :members: call
+
+
+TFDebertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/deberta_v2.rst
+++ b/docs/source/model_doc/deberta_v2.rst
@@ -53,12 +53,13 @@ New in v2:
  transformer layer to better learn the local dependency of input tokens.
 - **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
  experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative postions** The DeBERTa-v2 model uses log bucket to encode relative positions
+- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
  similar to T5.
 - **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
  performance of downstream tasks.

-This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. This model TF 2.0 implementation was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/microsoft/DeBERTa>`__.


@@ -117,3 +118,45 @@ DebertaV2ForQuestionAnswering

 .. autoclass:: transformers.DebertaV2ForQuestionAnswering
    :members: forward
+
+
+TFDebertaV2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2Model
+    :members: call
+
+
+TFDebertaV2PreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2PreTrainedModel
+    :members: call
+
+
+TFDebertaV2ForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForMaskedLM
+    :members: call
+
+
+TFDebertaV2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForSequenceClassification
+    :members: call
+
+
+TFDebertaV2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForTokenClassification
+    :members: call
+
+
+TFDebertaV2ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -44,8 +44,9 @@ Tips:
 - DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
  necessary though, just let us know if you need this option.

-This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. The original code can be found
-:prefix_link:`here <examples/research-projects/distillation>`.
+This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. This model jax version was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found :prefix_link:`here
+<examples/research_projects/distillation>`.


 DistilBertConfig
@@ -152,3 +153,45 @@ TFDistilBertForQuestionAnswering

 .. autoclass:: transformers.TFDistilBertForQuestionAnswering
    :members: call
+
+
+FlaxDistilBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertModel
+    :members: __call__
+
+
+FlaxDistilBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForMaskedLM
+    :members: __call__
+
+
+FlaxDistilBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForSequenceClassification
+    :members: __call__
+
+
+FlaxDistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForMultipleChoice
+    :members: __call__
+
+
+FlaxDistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForTokenClassification
+    :members: __call__
+
+
+FlaxDistilBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -40,3 +40,10 @@ EncoderDecoderModel

 .. autoclass:: transformers.EncoderDecoderModel
    :members: forward, from_encoder_decoder_pretrained
+
+
+FlaxEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxEncoderDecoderModel
+    :members: __call__, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/fnet.rst
+++ b/docs/source/model_doc/fnet.rst
@@ -0,0 +1,121 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+FNet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The FNet model was proposed in `FNet: Mixing Tokens with Fourier Transforms <https://arxiv.org/abs/2105.03824>`__ by
+James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. The model replaces the self-attention layer in a BERT
+model with a fourier transform which returns only the real parts of the transform. The model is significantly faster
+than the BERT model because it has fewer parameters and is more memory efficient. The model achieves about 92-97%
+accuracy of BERT counterparts on GLUE benchmark, and trains much faster than the BERT model. The abstract from the
+paper is the following:
+
+*We show that Transformer encoder architectures can be sped up, with limited accuracy costs, by replacing the
+self-attention sublayers with simple linear transformations that "mix" input tokens. These linear mixers, along with
+standard nonlinearities in feed-forward layers, prove competent at modeling semantic relationships in several text
+classification tasks. Most surprisingly, we find that replacing the self-attention sublayer in a Transformer encoder
+with a standard, unparameterized Fourier Transform achieves 92-97% of the accuracy of BERT counterparts on the GLUE
+benchmark, but trains 80% faster on GPUs and 70% faster on TPUs at standard 512 input lengths. At longer input lengths,
+our FNet model is significantly faster: when compared to the "efficient" Transformers on the Long Range Arena
+benchmark, FNet matches the accuracy of the most accurate models, while outpacing the fastest models across all
+sequence lengths on GPUs (and across relatively shorter lengths on TPUs). Finally, FNet has a light memory footprint
+and is particularly efficient at smaller model sizes; for a fixed speed and accuracy budget, small FNet models
+outperform Transformer counterparts.*
+
+Tips on usage:
+
+- The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with
+  maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum
+  sequence length for fine-tuning and inference.
+
+This model was contributed by `gchhablani <https://huggingface.co/gchhablani>`__. The original code can be found `here
+<https://github.com/google-research/google-research/tree/master/f_net>`__.
+
+FNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetConfig
+    :members:
+
+
+FNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+FNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetTokenizerFast
+    :members:
+
+
+FNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetModel
+    :members: forward
+
+
+FNetForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForPreTraining
+    :members: forward
+
+
+FNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForMaskedLM
+    :members: forward
+
+
+FNetForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForNextSentencePrediction
+    :members: forward
+
+FNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForSequenceClassification
+    :members: forward
+
+
+FNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForMultipleChoice
+    :members: forward
+
+
+FNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForTokenClassification
+    :members: forward
+
+
+FNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -36,10 +36,11 @@ Tips:
 - GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be
  observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
-  `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
-  this argument.
+- The model can take the `past_key_values` (for PyTorch) or `past` (for TF) as input, which is the previously computed
+  key/value attention pairs. Using this (`past_key_values` or `past`) value prevents the model from re-computing
+  pre-computed values in the context of text generation. For PyTorch, see `past_key_values` argument of the
+  :meth:`~transformers.GPT2Model.forward` method, or for TF the `past` argument of the
+  :meth:`~transformers.TFGPT2Model.call` method for more information on its usage.

 `Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
@@ -108,6 +109,13 @@ GPT2ForSequenceClassification
    :members: forward


+GPT2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2ForTokenClassification
+    :members: forward
+
+
 TFGPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/gptj.rst
+++ b/docs/source/model_doc/gptj.rst
@@ -0,0 +1,107 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+GPT-J
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The GPT-J model was released in the `kingoflolz/mesh-transformer-jax
+<https://github.com/kingoflolz/mesh-transformer-jax>`__ repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like
+causal language model trained on `the Pile <https://pile.eleuther.ai/>`__ dataset.
+
+This model was contributed by `Stella Biderman <https://huggingface.co/stellaathena>`__.
+
+Tips:
+
+- Running [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) in float32 precision on GPU requires at least 24 GB of
+  RAM. On GPUs with less than 24 GB RAM, one should therefore load the model in half-precision:
+
+.. code-block::
+
+    >>> from transformers import GPTJForCausalLM
+    >>> import torch
+
+    >>> model =  GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
+
+- Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
+  tokens are added for the sake of efficiency on TPUs. To avoid the mis-match between embedding matrix size and vocab
+  size, the tokenizer for [GPT-J](https://huggingface.co/EleutherAI/gpt-j-6B) contains 143 extra tokens
+  ``<|extratoken_1|>... <|extratoken_143|>``, so the ``vocab_size`` of tokenizer also becomes 50400.
+
+Generation
+_______________________________________________________________________________________________________________________
+
+The :meth:`~transformers.generation_utils.GenerationMixin.generate` method can be used to generate text using GPT-J
+model.
+
+.. code-block::
+
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+    >>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+...or in float16 precision:
+
+.. code-block::
+
+    >>> from transformers import GPTJForCausalLM, AutoTokenizer
+    >>> import torch
+
+    >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
+    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+
+GPTJConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJConfig
+    :members:
+
+GPTJModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJModel
+    :members: forward
+
+
+GPTJForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJForCausalLM
+    :members: forward
+
+
+GPTJForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJForSequenceClassification
+    :members: forward
--- a/docs/source/model_doc/hubert.rst
+++ b/docs/source/model_doc/hubert.rst
@@ -64,6 +64,14 @@ HubertForCTC
 .. autoclass:: transformers.HubertForCTC
    :members: forward

+
+HubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HubertForSequenceClassification
+    :members: forward
+
+
 TFHubertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/layoutlmv2.rst
+++ b/docs/source/model_doc/layoutlmv2.rst
@@ -0,0 +1,314 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LayoutLMV2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LayoutLMV2 model was proposed in `LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding
+<https://arxiv.org/abs/2012.14740>`__ by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu,
+Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. LayoutLMV2 improves `LayoutLM
+<https://huggingface.co/transformers/model_doc/layoutlm.html>`__ to obtain state-of-the-art results across several
+document image understanding benchmarks:
+
+- information extraction from scanned documents: the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__ dataset (a
+  collection of 199 annotated forms comprising more than 30,000 words), the `CORD <https://github.com/clovaai/cord>`__
+  dataset (a collection of 800 receipts for training, 100 for validation and 100 for testing), the `SROIE
+  <https://rrc.cvc.uab.es/?ch=13>`__ dataset (a collection of 626 receipts for training and 347 receipts for testing)
+  and the `Kleister-NDA <https://github.com/applicaai/kleister-nda>`__ dataset (a collection of non-disclosure
+  agreements from the EDGAR database, including 254 documents for training, 83 documents for validation, and 203
+  documents for testing).
+- document image classification: the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset (a collection of
+  400,000 images belonging to one of 16 classes).
+- document visual question answering: the `DocVQA <https://arxiv.org/abs/2007.00398>`__ dataset (a collection of 50,000
+  questions defined on 12,000+ document images).
+
+The abstract from the paper is the following:
+
+*Pre-training of text and layout has proved effective in a variety of visually-rich document understanding tasks due to
+its effective model architecture and the advantage of large-scale unlabeled scanned/digital-born documents. In this
+paper, we present LayoutLMv2 by pre-training text, layout and image in a multi-modal framework, where new model
+architectures and pre-training tasks are leveraged. Specifically, LayoutLMv2 not only uses the existing masked
+visual-language modeling task but also the new text-image alignment and text-image matching tasks in the pre-training
+stage, where cross-modality interaction is better learned. Meanwhile, it also integrates a spatial-aware self-attention
+mechanism into the Transformer architecture, so that the model can fully understand the relative positional
+relationship among different text blocks. Experiment results show that LayoutLMv2 outperforms strong baselines and
+achieves new state-of-the-art results on a wide variety of downstream visually-rich document understanding tasks,
+including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.9781), Kleister-NDA (0.834 -> 0.852),
+RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
+this https URL.*
+
+Tips:
+
+- The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during
+  pre-training (while LayoutLMv1 only adds visual embeddings during fine-tuning).
+- LayoutLMv2 adds both a relative 1D attention bias as well as a spatial 2D attention bias to the attention scores in
+  the self-attention layers. Details can be found on page 5 of the `paper <https://arxiv.org/abs/2012.14740>`__.
+- Demo notebooks on how to use the LayoutLMv2 model on RVL-CDIP, FUNSD, DocVQA, CORD can be found `here
+  <https://github.com/NielsRogge/Transformers-Tutorials>`__.
+- LayoutLMv2 uses Facebook AI's `Detectron2 <https://github.com/facebookresearch/detectron2/>`__ package for its visual
+  backbone. See `this link <https://detectron2.readthedocs.io/en/latest/tutorials/install.html>`__ for installation
+  instructions.
+- In addition to :obj:`input_ids`, :meth:`~transformer.LayoutLMv2Model.forward` expects 2 additional inputs, namely
+  :obj:`image` and :obj:`bbox`. The :obj:`image` input corresponds to the original document image in which the text
+  tokens occur. The model expects each document image to be of size 224x224. This means that if you have a batch of
+  document images, :obj:`image` should be a tensor of shape (batch_size, 3, 224, 224). This can be either a
+  :obj:`torch.Tensor` or a :obj:`Detectron2.structures.ImageList`. You don't need to normalize the channels, as this is
+  done by the model. Important to note is that the visual backbone expects BGR channels instead of RGB, as all models
+  in Detectron2 are pre-trained using the BGR format. The :obj:`bbox` input are the bounding boxes (i.e. 2D-positions)
+  of the input text tokens. This is identical to :class:`~transformer.LayoutLMModel`. These can be obtained using an
+  external OCR engine such as Google's `Tesseract <https://github.com/tesseract-ocr/tesseract>`__ (there's a `Python
+  wrapper <https://pypi.org/project/pytesseract/>`__ available). Each bounding box should be in (x0, y0, x1, y1)
+  format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1)
+  represents the position of the lower right corner. Note that one first needs to normalize the bounding boxes to be on
+  a 0-1000 scale. To normalize, you can use the following function:
+
+.. code-block::
+
+    def normalize_bbox(bbox, width, height):
+         return [
+             int(1000 * (bbox[0] / width)),
+             int(1000 * (bbox[1] / height)),
+             int(1000 * (bbox[2] / width)),
+             int(1000 * (bbox[3] / height)),
+         ]
+
+Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
+occurs (before resizing the image). Those can be obtained using the Python Image Library (PIL) library for example, as
+follows:
+
+.. code-block::
+
+    from PIL import Image
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+
+    width, height = image.size
+
+However, this model includes a brand new :class:`~transformer.LayoutLMv2Processor` which can be used to directly
+prepare data for the model (including applying OCR under the hood). More information can be found in the "Usage"
+section below.
+
+- Internally, :class:`~transformer.LayoutLMv2Model` will send the :obj:`image` input through its visual backbone to
+  obtain a lower-resolution feature map, whose shape is equal to the :obj:`image_feature_pool_shape` attribute of
+  :class:`~transformer.LayoutLMv2Config`. This feature map is then flattened to obtain a sequence of image tokens. As
+  the size of the feature map is 7x7 by default, one obtains 49 image tokens. These are then concatenated with the text
+  tokens, and send through the Transformer encoder. This means that the last hidden states of the model will have a
+  length of 512 + 49 = 561, if you pad the text tokens up to the max length. More generally, the last hidden states
+  will have a shape of :obj:`seq_length` + :obj:`image_feature_pool_shape[0]` *
+  :obj:`config.image_feature_pool_shape[1]`.
+- When calling :meth:`~transformer.LayoutLMv2Model.from_pretrained`, a warning will be printed with a long list of
+  parameter names that are not initialized. This is not a problem, as these parameters are batch normalization
+  statistics, which are going to have values when fine-tuning on a custom dataset.
+- If you want to train the model in a distributed environment, make sure to call :meth:`synchronize_batch_norm` on the
+  model in order to properly synchronize the batch normalization layers of the visual backbone.
+
+In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on
+:doc:`LayoutXLM's documentation page <layoutxlm>`.
+
+Usage: LayoutLMv2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The easiest way to prepare data for the model is to use :class:`~transformer.LayoutLMv2Processor`, which internally
+combines a feature extractor (:class:`~transformer.LayoutLMv2FeatureExtractor`) and a tokenizer
+(:class:`~transformer.LayoutLMv2Tokenizer` or :class:`~transformer.LayoutLMv2TokenizerFast`). The feature extractor
+handles the image modality, while the tokenizer handles the text modality. A processor combines both, which is ideal
+for a multi-modal model like LayoutLMv2. Note that you can still use both separately, if you only want to handle one
+modality.
+
+.. code-block::
+
+    from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor
+
+    feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
+    tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    processor = LayoutLMv2Processor(feature_extractor, tokenizer)
+
+In short, one can provide a document image (and possibly additional data) to :class:`~transformer.LayoutLMv2Processor`,
+and it will create the inputs expected by the model. Internally, the processor first uses
+:class:`~transformer.LayoutLMv2FeatureExtractor` to apply OCR on the image to get a list of words and normalized
+bounding boxes, as well to resize the image to a given size in order to get the :obj:`image` input. The words and
+normalized bounding boxes are then provided to :class:`~transformer.LayoutLMv2Tokenizer` or
+:class:`~transformer.LayoutLMv2TokenizerFast`, which converts them to token-level :obj:`input_ids`,
+:obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`. Optionally, one can provide word labels to the processor,
+which are turned into token-level :obj:`labels`.
+
+:class:`~transformer.LayoutLMv2Processor` uses `PyTesseract <https://pypi.org/project/pytesseract/>`__, a Python
+wrapper around Google's Tesseract OCR engine, under the hood. Note that you can still use your own OCR engine of
+choice, and provide the words and normalized boxes yourself. This requires initializing
+:class:`~transformer.LayoutLMv2FeatureExtractor` with :obj:`apply_ocr` set to :obj:`False`.
+
+In total, there are 5 use cases that are supported by the processor. Below, we list them all. Note that each of these
+use cases work for both batched and non-batched inputs (we illustrate them for non-batched inputs).
+
+**Use case 1: document image classification (training, inference) + token classification (inference), apply_ocr =
+True**
+
+This is the simplest case, in which the processor (actually the feature extractor) will perform OCR on the image to get
+the words and normalized bounding boxes.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False**
+
+In case one wants to do OCR themselves, one can initialize the feature extractor with :obj:`apply_ocr` set to
+:obj:`False`. In that case, one should provide the words and corresponding (normalized) bounding boxes themselves to
+the processor.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    encoding = processor(image, words, boxes=boxes, return_tensors="pt")
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 3: token classification (training), apply_ocr=False**
+
+For token classification tasks (such as FUNSD, CORD, SROIE, Kleister-NDA), one can also provide the corresponding word
+labels in order to train a model. The processor will then convert these into token-level :obj:`labels`. By default, it
+will only label the first wordpiece of a word, and label the remaining wordpieces with -100, which is the
+:obj:`ignore_index` of PyTorch's CrossEntropyLoss. In case you want all wordpieces of a word to be labeled, you can
+initialize the tokenizer with :obj:`only_label_first_subword` set to :obj:`False`.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    word_labels = [1, 2]
+    encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels', 'image'])
+
+**Use case 4: visual question answering (inference), apply_ocr=True**
+
+For visual question answering tasks (such as DocVQA), you can provide a question to the processor. By default, the
+processor will apply OCR on the image, and create [CLS] question tokens [SEP] word tokens [SEP].
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    question = "What's his name?"
+    encoding = processor(image, question, return_tensors="pt") 
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 5: visual question answering (inference), apply_ocr=False**
+
+For visual question answering tasks (such as DocVQA), you can provide a question to the processor. If you want to
+perform OCR yourself, you can provide your own words and (normalized) bounding boxes to the processor.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    question = "What's his name?"
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")  
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+LayoutLMv2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Config
+    :members:
+
+
+LayoutLMv2FeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2FeatureExtractor
+    :members: __call__
+
+
+LayoutLMv2Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Tokenizer
+    :members: __call__, save_vocabulary
+
+
+LayoutLMv2TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2TokenizerFast
+    :members: __call__
+
+
+LayoutLMv2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Processor
+    :members: __call__
+
+
+LayoutLMv2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Model
+    :members: forward
+
+
+LayoutLMv2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForSequenceClassification
+    :members:
+
+
+LayoutLMv2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForTokenClassification
+    :members:
+
+
+LayoutLMv2ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/layoutxlm.rst
+++ b/docs/source/model_doc/layoutxlm.rst
@@ -0,0 +1,56 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LayoutXLM
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+LayoutXLM was proposed in `LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding
+<https://arxiv.org/abs/2104.08836>`__ by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha
+Zhang, Furu Wei. It's a multilingual extension of the `LayoutLMv2 model <https://arxiv.org/abs/2012.14740>`__ trained
+on 53 languages.
+
+The abstract from the paper is the following:
+
+*Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually-rich document
+understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. In
+this paper, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to
+bridge the language barriers for visually-rich document understanding. To accurately evaluate LayoutXLM, we also
+introduce a multilingual form understanding benchmark dataset named XFUN, which includes form understanding samples in
+7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese), and key-value pairs are manually labeled
+for each language. Experiment results show that the LayoutXLM model has significantly outperformed the existing SOTA
+cross-lingual pre-trained models on the XFUN dataset.*
+
+One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like so:
+
+.. code-block::
+
+    from transformers import LayoutLMv2Model
+
+    model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base') 
+
+Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
+initialize it as follows:
+
+.. code-block::
+
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base') 
+
+As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
+<layoutlmv2>` for all tips, code examples and notebooks.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/microsoft/unilm>`__.
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -46,8 +46,8 @@ Tips:
 - LED makes use of *global attention* by means of the ``global_attention_mask`` (see
  :class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
  ``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question.
- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by setting
-  ``config.gradient_checkpointing = True``.
+- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by executing
+  ``model.gradient_checkpointing_enable()``.
 - A notebook showing how to evaluate LED, can be accessed `here
  <https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
 - A notebook showing how to fine-tune LED, can be accessed `here
--- a/docs/source/model_doc/m2m_100.rst
+++ b/docs/source/model_doc/m2m_100.rst
@@ -58,7 +58,7 @@ examples. To install :obj:`sentencepiece` run ``pip install sentencepiece``.
    tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")

    src_text = "Life is like a box of chocolates."
-    tgt_lang = "La vie est comme une boîte de chocolat."
+    tgt_text = "La vie est comme une boîte de chocolat."

    model_inputs = tokenizer(src_text, return_tensors="pt")
    with tokenizer.as_target_tokenizer():
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -103,8 +103,8 @@ Here is the code to see all available pretrained models on the hub:

 .. code-block:: python

-    from transformers.hf_api import HfApi
-    model_list = HfApi().model_list()
+    from huggingface_hub.hf_api import HfApi
+    model_list = HfApi().list_models()
    org = "Helsinki-NLP"
    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
    suffix = [x.split('/')[1] for x in model_ids]
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -49,11 +49,11 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz

    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer

-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"

-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
    >>> with tokenizer.as_target_tokenizer():
    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")

--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@@ -10,7 +10,7 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-MT5
+mT5
 -----------------------------------------------------------------------------------------------------------------------

 Overview
@@ -24,9 +24,28 @@ The abstract from the paper is the following:

 *The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
 state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
-multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We describe
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
 the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
-benchmarks. All of the code and model checkpoints*
+benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
+generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
+checkpoints used in this work are publicly available.*
+
+Note: mT5 was only pre-trained on `mC4 <https://huggingface.co/datasets/mc4>`__ excluding any supervised training.
+Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5 model.
+Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- `google/mt5-small <https://huggingface.co/google/mt5-small>`__
+
+- `google/mt5-base <https://huggingface.co/google/mt5-base>`__
+
+- `google/mt5-large <https://huggingface.co/google/mt5-large>`__
+
+- `google/mt5-xl <https://huggingface.co/google/mt5-xl>`__
+
+- `google/mt5-xxl <https://huggingface.co/google/mt5-xxl>`__.

 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
 found `here <https://github.com/google-research/multilingual-t5>`__.
@@ -94,3 +113,17 @@ TFMT5EncoderModel

 .. autoclass:: transformers.TFMT5EncoderModel
    :members:
+
+
+FlaxMT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5Model
+    :members:
+
+
+FlaxMT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5ForConditionalGeneration
+    :members:
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -152,3 +152,17 @@ TFPegasusForConditionalGeneration

 .. autoclass:: transformers.TFPegasusForConditionalGeneration
    :members: call
+
+
+FlaxPegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPegasusModel
+    :members: __call__, encode, decode
+
+
+FlaxPegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPegasusForConditionalGeneration
+    :members: __call__, encode, decode
--- a/docs/source/model_doc/rembert.rst
+++ b/docs/source/model_doc/rembert.rst
@@ -0,0 +1,161 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+RemBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The RemBERT model was proposed in `Rethinking Embedding Coupling in Pre-trained Language Models
+<https://arxiv.org/abs/2010.12821>`__ by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder.
+
+The abstract from the paper is the following:
+
+*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art
+pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to
+significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By
+reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on
+standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that
+allocating additional capacity to the output embedding provides benefits to the model that persist through the
+fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger
+output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage
+Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these
+findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the
+number of parameters at the fine-tuning stage.*
+
+Tips:
+
+For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the
+embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input
+embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is
+also similar to the Albert one rather than the BERT one.
+
+RemBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertConfig
+    :members:
+
+
+RemBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RemBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertTokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RemBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertModel
+    :members: forward
+
+
+RemBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForCausalLM
+    :members: forward
+
+
+RemBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForMaskedLM
+    :members: forward
+
+
+RemBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForSequenceClassification
+    :members: forward
+
+
+RemBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForMultipleChoice
+    :members: forward
+
+
+RemBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForTokenClassification
+    :members: forward
+
+
+RemBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForQuestionAnswering
+    :members: forward
+
+
+TFRemBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertModel
+    :members: call
+
+
+TFRemBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForMaskedLM
+    :members: call
+
+
+TFRemBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForCausalLM
+    :members: call
+
+
+TFRemBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForSequenceClassification
+    :members: call
+
+
+TFRemBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForMultipleChoice
+    :members: call
+
+
+TFRemBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForTokenClassification
+    :members: call
+
+
+TFRemBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/speech_to_text.rst
+++ b/docs/source/model_doc/speech_to_text.rst
@@ -42,8 +42,8 @@ features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transfo
 predicted token ids.

 The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
-install those packages before running the examples. You could either install those as extra speech dependancies with
-``pip install transformers"[speech, sentencepiece]"`` or install the packages seperatly with ``pip install torchaudio
+install those packages before running the examples. You could either install those as extra speech dependencies with
+``pip install transformers"[speech, sentencepiece]"`` or install the packages seperately with ``pip install torchaudio
 sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
 <http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
 be installed as follows: ``apt install libsndfile1-dev``
--- a/docs/source/model_doc/speech_to_text_2.rst
+++ b/docs/source/model_doc/speech_to_text_2.rst
@@ -0,0 +1,123 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Speech2Text2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Speech2Text2 model is used together with :doc:`Wav2Vec2 <wav2vec2>` for Speech Translation models proposed in
+`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
+Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+
+Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
+:doc:`Wav2Vec2 <wav2vec2>` or :doc:`HuBERT <hubert>` for Speech-to-Text tasks. Please refer to the
+:doc:`SpeechEncoderDecoder <speechencoderdecoder>` class on how to combine Speech2Text2 with any speech *encoder-only*
+model.
+
+This model was contributed by `Patrick von Platen <https://huggingface.co/patrickvonplaten>`__.
+
+The original code can be found `here
+<https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266>`__.
+
+
+Tips:
+
+- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
+  the `official models <https://huggingface.co/models?other=speech2text2>`__ .
+- Speech2Text2 is always used within the :doc:`SpeechEncoderDecoder <speechencoderdecoder>` framework.
+- Speech2Text2's tokenizer currently only supports inference, but not training.
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Speech2Text2's :class:`~transformers.SpeechEncoderDecoderModel` model accepts raw waveform input values from speech and
+makes use of :func:`~transformers.generation_utils.GenerationMixin.generate` to translate the input speech
+autoregressively to the target language.
+
+The :class:`~transformers.Wav2Vec2FeatureExtractor` class is responsible for preprocessing the input speech and
+:class:`~transformers.Speech2Text2Tokenizer` decodes the generated target tokens to the target string. The
+:class:`~transformers.Speech2Text2Processor` wraps :class:`~transformers.Wav2Vec2FeatureExtractor` and
+:class:`~transformers.Speech2Text2Tokenizer` into a single instance to both extract the input features and decode the
+predicted token ids.
+
+- Step-by-step Speech Translation
+
+.. code-block::
+
+        >>> import torch
+        >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+        >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
+        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+        >>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"])
+
+        >>> transcription = processor.batch_decode(generated_ids)
+
+
+- Speech Translation via Pipelines
+
+    The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
+
+.. code-block::
+
+        >>> from datasets import load_dataset
+        >>> from transformers import pipeline
+
+        >>> librispeech_en = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+
+        >>> translation_de = asr(librispeech_en[0]["file"])
+
+
+See `model hub <https://huggingface.co/models?filter=speech2text2>`__ to look for Speech2Text2 checkpoints.
+
+
+Speech2Text2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Config
+    :members:
+
+
+Speech2TextTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Tokenizer
+    :members: batch_decode, decode, save_vocabulary
+
+
+Speech2Text2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Processor
+    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
+
+
+Speech2Text2ForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2ForCausalLM
+    :members: forward
--- a/docs/source/model_doc/speechencoderdecoder.rst
+++ b/docs/source/model_doc/speechencoderdecoder.rst
@@ -0,0 +1,40 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Speech Encoder Decoder Models
+-----------------------------------------------------------------------------------------------------------------------
+
+The :class:`~transformers.SpeechEncoderDecoderModel` can be used to initialize a speech-sequence-to-text-sequence model
+with any pretrained speech autoencoding model as the encoder (*e.g.* :doc:`Wav2Vec2 <wav2vec2>`, :doc:`Hubert
+<hubert>`) and any pretrained autoregressive model as the decoder.
+
+The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
+recognition and speech translation has *e.g.* been shown in `Large-Scale Self- and Semi-Supervised Learning for Speech
+Translation <https://arxiv.org/abs/2104.06678>`__ by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
+Alexis Conneau.
+
+An example of how to use a :class:`~transformers.SpeechEncoderDecoderModel` for inference can be seen in
+:doc:`Speech2Text2 <speech_to_text_2>`.
+
+
+SpeechEncoderDecoderConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpeechEncoderDecoderConfig
+    :members:
+
+
+SpeechEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpeechEncoderDecoderModel
+    :members: forward, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/splinter.rst
+++ b/docs/source/model_doc/splinter.rst
@@ -0,0 +1,87 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Splinter
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Splinter model was proposed in `Few-Shot Question Answering by Pretraining Span Selection
+<https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter
+is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus
+comprising Wikipedia and the Toronto Book Corpus.
+
+The abstract from the paper is the following:
+
+In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order
+of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred
+training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between
+current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question
+answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all
+recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans
+are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select
+the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD
+with only 128 training examples), while maintaining competitive performance in the high-resource setting.
+
+Tips:
+
+- Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. These tokens contextualize
+  to question representations which are used to predict the answers. This layer is called QASS, and is the default
+  behaviour in the :class:`~transformers.SplinterForQuestionAnswering` class. Therefore:
+- Use :class:`~transformers.SplinterTokenizer` (rather than :class:`~transformers.BertTokenizer`), as it already
+  contains this special token. Also, its default behavior is to use this token when two sequences are given (for
+  example, in the `run_qa.py` script).
+- If you plan on using Splinter outside `run_qa.py`, please keep in mind the question token - it might be important for
+  the success of your model, especially in a few-shot setting.
+- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that
+  one also has the pretrained wights of the QASS layer (`tau/splinter-base-qass` and `tau/splinter-large-qass`) and one
+  doesn't (`tau/splinter-base` and `tau/splinter-large`). This is done to support randomly initializing this layer at
+  fine-tuning, as it is shown to yield better results for some cases in the paper.
+
+This model was contributed by `yuvalkirstain <https://huggingface.co/yuvalkirstain>`__ and `oriram
+<https://huggingface.co/oriram>`__. The original code can be found `here <https://github.com/oriram/splinter>`__.
+
+SplinterConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterConfig
+    :members:
+
+
+SplinterTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+SplinterTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterTokenizerFast
+    :members:
+
+
+SplinterModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterModel
+    :members: forward
+
+
+SplinterForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -13,9 +13,6 @@
 T5
 -----------------------------------------------------------------------------------------------------------------------

-**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -42,28 +39,56 @@ Tips:
  different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
  for summarization: *summarize: ...*.

-  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper
-  <https://arxiv.org/pdf/1910.10683.pdf>`__. - For sequence-to-sequence generation, it is recommended to use
-  :meth:`~transformers.generation_utils.GenerationMixin.generate`. This method takes care of feeding the encoded input
-  via cross-attention layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative
-  scalar embeddings. Encoder input padding can be done on the left and on the right.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+- See the :ref:`training`, :ref:`inference` and :ref:`scripts` sections below for all details regarding usage.
+
+T5 comes in different sizes:
+
+- `t5-small <https://huggingface.co/t5-small>`__
+
+- `t5-base <https://huggingface.co/t5-base>`__
+
+- `t5-large <https://huggingface.co/t5-large>`__
+
+- `t5-3b <https://huggingface.co/t5-3b>`__
+
+- `t5-11b <https://huggingface.co/t5-11b>`__.
+
+Based on the original T5 model, Google has released some follow-up works:
+
+- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without
+  mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found :doc:`here <t5v1.1>`.
+
+- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to
+  the documentation of mT5 which can be found :doc:`here <mt5>`.
+
+- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
+  to the documentation of byT5 which can be found :doc:`here <byt5>`.
+
+All checkpoints can be found on the `hub <https://huggingface.co/models?search=t5>`__.

 This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
 <https://github.com/google-research/text-to-text-transfer-transformer>`__.

+.. _training:
+
 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
-forcing. This means that for training we always need an input sequence and a target sequence. The input sequence is fed
-to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a start-sequence
-token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target sequence is then
-appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the start-sequence
-token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
+sequence is fed to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a
+start-sequence token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target
+sequence is then appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the
+start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+One can use :class:`~transformers.T5ForConditionalGeneration` (or the Tensorflow/Flax variant), which includes the
+language modeling head on top of the decoder.

 - Unsupervised denoising training

-  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+  In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
  the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
  sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`,
  :obj:`<extra_id_1>`, ... up to :obj:`<extra_id_99>`. As a default, 100 sentinel tokens are available in
@@ -72,34 +97,201 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
  For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
  processed as follows:

-.. code-block::
+    .. code-block::

-    from transformers import T5ForConditionalGeneration, T5Tokenizer
-    model = T5ForConditionalGeneration.from_pretrained("t5-small")
-    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        from transformers import T5Tokenizer, T5ForConditionalGeneration

-    input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-    labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+        labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+        # the forward function automatically creates the correct decoder_input_ids
+        loss = model(input_ids=input_ids, labels=labels).loss
+
+  If you're interested in pre-training T5 on a new corpus, check out the `run_t5_mlm_flax.py
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling>`__ script in the Examples
+  directory.

 - Supervised training

-  In this setup the input sequence and output sequence are standard sequence-to-sequence input output mapping. In
-  translation, for instance with the input sequence "The house is wonderful." and output sequence "Das Haus ist
-  wunderbar.", the sentences should be processed as follows:
+  In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
+  Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input
+  sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for
+  the model as follows:
+
+    .. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+        labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+        # the forward function automatically creates the correct decoder_input_ids
+        loss = model(input_ids=input_ids, labels=labels).loss
+
+  As you can see, only 2 inputs are required for the model in order to compute a loss: :obj:`input_ids` (which are the
+  :obj:`input_ids` of the encoded input sequence) and :obj:`labels` (which are the :obj:`input_ids` of the encoded
+  target sequence). The model will automatically create the :obj:`decoder_input_ids` based on the :obj:`labels`, by
+  shifting them one position to the right and prepending the :obj:`config.decoder_start_token_id`, which for T5 is
+  equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate
+  English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used
+  during T5's pre-training.
+
+  However, the example above only shows a single training example. In practice, one trains deep learning models in
+  batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one
+  typically defines a :obj:`max_source_length` and :obj:`max_target_length`, which determine the maximum length of the
+  input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on
+  the task.
+
+  In addition, we must make sure that padding token id's of the :obj:`labels` are not taken into account by the loss
+  function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the :obj:`ignore_index`
+  of the :obj:`CrossEntropyLoss`. In Flax, one can use the :obj:`decoder_attention_mask` to ignore padded tokens from
+  the loss (see the `Flax summarization script
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__ for details). We also pass
+  :obj:`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are
+  ignored. The code example below illustrates all of this.
+
+    .. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration 
+        import torch
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        # the following 2 hyperparameters are task-specific
+        max_source_length = 512
+        max_target_length = 128
+
+        # Suppose we have the following 2 training examples:
+        input_sequence_1 = "Welcome to NYC"
+        output_sequence_1 = "Bienvenue à NYC"
+
+        input_sequence_2 = "HuggingFace is a company"
+        output_sequence_2 = "HuggingFace est une entreprise"
+
+        # encode the inputs
+        task_prefix = "translate English to French: "
+        input_sequences = [input_sequence_1, input_sequence_2]
+        encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
+                             padding='longest', 
+                             max_length=max_source_length, 
+                             truncation=True, 
+                             return_tensors="pt")
+        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
+
+        # encode the targets
+        target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
+                                    padding='longest', 
+                                    max_length=max_target_length, 
+                                    truncation=True)
+        labels = target_encoding.input_ids
+
+        # replace padding token id's of the labels by -100
+        labels = [
+                   [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
+        ] 
+        labels = torch.tensor(labels)
+
+        # forward pass
+        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
+
+Additional training tips:
+
+- T5 models need a slightly higher learning rate than the default one set in the :obj:`Trainer` when using the AdamW
+  optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
+  answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
+
+- According to `this forum post <https://discuss.huggingface.co/t/t5-finetuning-tips/684>`__, task prefixes matter when
+  (1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
+  pre-training mixture (see Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`__ for the task prefixes
+  used).
+
+- If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
+  `pad_to_multiple_of` to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
+  batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is
+  encountered during training thus significantly slowing down the training. only padding up to the longest example in a
+  batch) leads to very slow training on TPU.
+
+.. _inference:
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+At inference time, it is recommended to use :meth:`~transformers.generation_utils.GenerationMixin.generate`. This
+method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
+and auto-regressively generates the decoder output. Check out `this blog post
+<https://huggingface.co/blog/how-to-generate>`__ to know all the details about generating text with Transformers.
+There's also `this blog post <https://huggingface.co/blog/encoder-decoder#encoder-decoder>`__ which explains how
+generation works in general in encoder-decoder models.

 .. code-block::

-    from transformers import T5ForConditionalGeneration, T5Tokenizer
-    model = T5ForConditionalGeneration.from_pretrained("t5-small")
-    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        from transformers import T5Tokenizer, T5ForConditionalGeneration 

-    input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-    labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")

+        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+        outputs = model.generate(input_ids)
+        print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        # Das Haus ist wunderbar.
+
+Note that T5 uses the :obj:`pad_token_id` as the :obj:`decoder_start_token_id`, so when doing generation without using
+:meth:`~transformers.generation_utils.GenerationMixin.generate`, make sure you start it with the :obj:`pad_token_id`.
+
+The example above only shows a single example. You can also do batched inference, like so:
+
+.. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        # when generating, we will use the logits of right-most token to predict the next token
+        # so the padding should be on the left
+        tokenizer.padding_side = "left" 
+        tokenizer.pad_token = tokenizer.eos_token # to avoid an error
+
+        task_prefix = 'translate English to German: '
+        sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+        inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
+
+        output_sequences = model.generate(
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
+            do_sample=False, # disable sampling to test if batching affects output
+        )
+
+        print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
+
+        # ['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
+
+.. _scripts:
+
+Example scripts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+T5 is supported by several example scripts, both for pre-training and fine-tuning.
+
+* pre-training: the `run_t5_mlm_flax.py
+  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_t5_mlm_flax.py>`__
+  script allows you to further pre-train T5 or pre-train T5 from scratch on your own data. The `t5_tokenizer_model.py
+  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/t5_tokenizer_model.py>`__
+  script allows you to further train a T5 tokenizer or train a T5 Tokenizer from scratch on your own data. Note that
+  Flax (a neural network library on top of JAX) is particularly useful to train on TPU hardware.
+
+* fine-tuning: T5 is supported by the official summarization scripts (`PyTorch
+  <https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization>`__, `Tensorflow
+  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/summarization>`__, and `Flax
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__) and translation scripts
+  (`PyTorch <https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation>`__ and `Tensorflow
+  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/translation>`__). These scripts allow
+  you to easily fine-tune T5 on custom data for summarization/translation.

 T5Config
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/t5v1.1.rst
+++ b/docs/source/model_doc/t5v1.1.rst
@@ -0,0 +1,66 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+T5v1.1
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+T5v1.1 was released in the `google-research/text-to-text-transfer-transformer
+<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__
+repository by Colin Raffel et al. It's an improved version of the original T5 model.
+
+One can directly plug in the weights of T5v1.1 into a T5 model, like so:
+
+.. code-block::
+
+    from transformers import T5ForConditionalGeneration
+
+    model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base') 
+
+T5 Version 1.1 includes the following improvements compared to the original T5 model:
+
+- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See `this paper
+  <https://arxiv.org/abs/2002.05202>`__.
+
+- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning.
+
+- Pre-trained on C4 only without mixing in the downstream tasks.
+
+- No parameter sharing between the embedding and classifier layer.
+
+- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger :obj:`d_model` and smaller
+  :obj:`num_heads` and :obj:`d_ff`.
+
+Note: T5 Version 1.1 was only pre-trained on `C4 <https://huggingface.co/datasets/c4>`__ excluding any supervised
+training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
+model. Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- `google/t5-v1_1-small <https://huggingface.co/google/t5-v1_1-small>`__
+
+- `google/t5-v1_1-base <https://huggingface.co/google/t5-v1_1-base>`__
+
+- `google/t5-v1_1-large <https://huggingface.co/google/t5-v1_1-large>`__
+
+- `google/t5-v1_1-xl <https://huggingface.co/google/t5-v1_1-xl>`__
+
+- `google/t5-v1_1-xxl <https://huggingface.co/google/t5-v1_1-xxl>`__.
+
+One can refer to :doc:`T5's documentation page <t5>` for all tips, code examples and notebooks.
+
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
+found `here
+<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__.
--- a/docs/source/model_doc/visual_bert.rst
+++ b/docs/source/model_doc/visual_bert.rst
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
        >>> # this is a custom function that returns the visual embeddings given the image path
        >>> visual_embeds = get_visual_embeddings(image_path)

+        >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+        >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+        >>> inputs.update({
+        ...     "visual_embeds": visual_embeds,
+        ...     "visual_token_type_ids": visual_token_type_ids,
+        ...     "visual_attention_mask": visual_attention_mask
+        ... })
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state

--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -66,6 +66,23 @@ Tips:
  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.

+Following the original Vision Transformer, some follow-up works have been made:
+
+- DeiT (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. Refer to
+  :doc:`DeiT's documentation page <deit>`. The authors of DeiT also released more efficiently trained ViT models, which
+  you can directly plug into :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. There
+  are 4 variants available (in 3 different sizes): `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`,
+  `facebook/deit-base-patch16-224` and `facebook/deit-base-patch16-384`. Note that one should use
+  :class:`~transformers.DeiTFeatureExtractor` in order to prepare images for the model.
+
+- BEiT (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained
+  vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE.
+  Refer to :doc:`BEiT's documentation page <beit>`.
+
+- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using
+  the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting
+  objects, without having ever been trained to do so. DINO checkpoints can be found on the `hub
+  <https://huggingface.co/models?other=dino>`__.

 This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code (written in JAX) can be
 found `here <https://github.com/google-research/vision_transformer>`__.
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -67,6 +67,22 @@ Wav2Vec2Processor
    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor


+Wav2Vec2 specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput
+    :members: 
+
+
 Wav2Vec2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -80,6 +96,14 @@ Wav2Vec2ForCTC
 .. autoclass:: transformers.Wav2Vec2ForCTC
    :members: forward

+
+Wav2Vec2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2ForSequenceClassification
+    :members: forward
+
+
 Wav2Vec2ForPreTraining
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -341,8 +341,8 @@ Add a model card

 To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
 please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
-titled "Add a README.md" on your model page. A model card template can be found `here
-<https://github.com/huggingface/model_card>`__ (meta-suggestions are welcome). model card template (meta-suggestions
+titled "Add a README.md" on your model page. A model card documentation can be found `here
+<https://huggingface.co/docs/hub/model-repos>`__ (meta-suggestions are welcome). model card template (meta-suggestions
 are welcome).

 .. note::
--- a/docs/source/parallelism.md
+++ b/docs/source/parallelism.md
@@ -58,7 +58,7 @@ a0 | b0 | c0
 a1 | b1 | c1
 a2 | b2 | c2
 ```
-Layer La has weights a0, at and a2.
+Layer La has weights a0, a1 and a2.

 If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so:

@@ -220,9 +220,12 @@ Special considerations: TP requires very fast network, and therefore it's not ad
 This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530).
 by [@anton-l](https://github.com/anton-l).

-Implementations:
+Alternative names:
 - DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation.
+
+Implementations:
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific
+- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)

 🤗 Transformers status:
 - core: not yet implemented in the core
--- a/docs/source/performance.md
+++ b/docs/source/performance.md
@@ -53,6 +53,7 @@ Software:
 - Tensor Parallelism
 - Low-memory Optimizers
 - fp16/bf16 (smaller data)
+- Gradient checkpointing



@@ -226,6 +227,21 @@ pytorch `autocast` which performs AMP include a caching feature, which speed thi

 Autocast maintains a cache of the FP16 casts of model params (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 params get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)

+
+### Gradient Checkpointing
+
+One way to use significantly less GPU memory is to enabled "Gradient Checkpointing" (also known as "activation checkpointing"). When enabled, a lot of memory can be freed at the cost of small decrease in the training speed due to recomputing parts of the graph during back-propagation.
+
+This technique was first shared in the paper: [Training Deep Nets with Sublinear Memory Cost](https://arxiv.org/abs/1604.06174). The paper will also give you the exact details on the savings, but it's in the ballpark of `O(sqrt(n))`, where `n` is the number of feed-forward layers.
+
+To activate this feature in 🤗 Transformers for models that support it, use:
+
+```python
+model.gradient_checkpointing_enable()
+```
+or add `--gradient_checkpointing` to the Trainer arguments.
+
+
 ### Batch sizes

 One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model.
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -96,11 +96,11 @@ dataset in memory.

 .. code-block:: python

-    from nlp import load_dataset
+    from datasets import load_dataset
    test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')

-With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average
+With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average negative
 log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in
 the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating
 as context to be included in our loss, so we can set these targets to ``-100`` so that they are ignored. The following
@@ -110,10 +110,13 @@ available to condition on).

 .. code-block:: python

+    import torch
+    from tqdm import tqdm
+
    max_length = model.config.n_positions
    stride = 512

-    lls = []
+    nlls = []
    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
@@ -124,11 +127,11 @@ available to condition on).

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs[0] * trg_len
+            neg_log_likelihood = outputs[0] * trg_len

-        lls.append(log_likelihood)
+        nlls.append(neg_log_likelihood)

-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
+    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

 Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window
 strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction,
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -243,15 +243,16 @@ three arguments you need to know for this are :obj:`padding`, :obj:`truncation`

 - :obj:`truncation` controls the truncation. It can be a boolean or a string which should be:

-    - :obj:`True` or :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
+    - :obj:`True` or :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
      the maximum length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will
-      only truncate the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
+      truncate token by token, removing a token from the longest sequence in the pair until the proper length is
+      reached.
    - :obj:`'only_second'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
      the second sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
-    - :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
-      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will truncate token
-      by token, removing a token from the longest sequence in the pair until the proper length is reached.
+    - :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
+      the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
    - :obj:`False` or :obj:`'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the
      default behavior.

--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -65,10 +65,10 @@ make them readable. For instance:
 .. code-block::

    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
-    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
+    [{'label': 'POSITIVE', 'score': 0.9998}]

-That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
-`batch`, returning a list of dictionaries like this one:
+That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model, returning
+a list of dictionaries like this one:

 .. code-block::

@@ -79,6 +79,8 @@ That's encouraging! You can use it on a list of sentences, which will be preproc
    label: POSITIVE, with score: 0.9998
    label: NEGATIVE, with score: 0.5309

+To use with a large dataset, look at :doc:`iterating over a pipeline <./main_classes/pipelines>`
+
 You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
 fairly neutral.

@@ -195,7 +197,8 @@ sequence:
 .. code-block::

    >>> print(inputs)
-    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

 You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
 batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
@@ -260,12 +263,12 @@ objects are described in greater detail :doc:`here <main_classes/output>`. For n
    >>> ## PYTORCH CODE
    >>> print(pt_outputs)
    SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833,  4.3364],
-        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> print(tf_outputs)
    TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.3364143 ],
-           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)
+    array([[-4.0833 ,  4.3364  ],
+           [ 0.0818, -0.0418]], dtype=float32)>, hidden_states=None, attentions=None)

 Notice how the output object has a ``logits`` attribute. You can use this to access the model's final activations.

@@ -283,7 +286,7 @@ Let's apply the SoftMax activation to get predictions.
    >>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
-    >>> tf.nn.softmax(tf_outputs.logits, axis=-1)
+    >>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)

 We can see we get the numbers from before:

@@ -292,8 +295,8 @@ We can see we get the numbers from before:
    >>> ## TENSORFLOW CODE
    >>> print(tf_predictions)
    tf.Tensor(
-    [[2.2042994e-04 9.9977952e-01]
-     [5.3086340e-01 4.6913657e-01]], shape=(2, 2), dtype=float32)
+    [[2.2043e-04 9.9978e-01]
+     [5.3086e-01 4.6914e-01]], shape=(2, 2), dtype=float32)
    >>> ## PYTORCH CODE
    >>> print(pt_predictions)
    tensor([[2.2043e-04, 9.9978e-01],
@@ -309,14 +312,14 @@ attribute:
    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
    >>> print(pt_outputs)
    SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833,  4.3364],
-    [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
    >>> print(tf_outputs)
-    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051287e-04, 6.3326043e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.3364143 ],
-           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)
+    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051e-04, 6.3326e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0833 ,  4.3364  ],
+           [ 0.0818, -0.0418]], dtype=float32)>, hidden_states=None, attentions=None)

 Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or `tf.keras.Model
 <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual training loop. 🤗
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -42,6 +42,7 @@ Ready-made configurations include the following models:
 - BERT
 - DistilBERT
 - GPT-2
+- LayoutLM
 - RoBERTa
 - T5
 - XLM-RoBERTa
@@ -99,6 +100,30 @@ It will be exported under ``onnx/bert-base-cased``. You should see similar logs:
                    -[✓] all values close (atol: 0.0001)
    All good, model saved at: onnx/bert-base-cased/model.onnx

+This export can now be used in the ONNX inference runtime:
+
+.. code-block::
+
+    import onnxruntime as ort
+
+    from transformers import BertTokenizerFast
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
+
+    ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
+
+    inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")
+    outputs = ort_session.run(["last_hidden_state", "pooler_output"], dict(inputs))
+
+The outputs used (:obj:`["last_hidden_state", "pooler_output"]`) can be obtained by taking a look at the ONNX
+configuration of each model. For example, for BERT:
+
+.. code-block::
+
+    from transformers.models.bert import BertOnnxConfig, BertConfig
+
+    config = BertConfig()
+    onnx_config = BertOnnxConfig(config)
+    output_keys = list(onnx_config.outputs.keys())

 Implementing a custom configuration for an unsupported architecture
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -142,6 +167,12 @@ An important fact to notice is the use of `OrderedDict` in both inputs and outpu
 as inputs are matched against their relative position within the `PreTrainedModel.forward()` prototype and outputs are
 match against there position in the returned `BaseModelOutputX` instance.

+An example of such an addition is visible here, for the MBart model: `Making MBART ONNX-convertible
+<https://github.com/huggingface/transformers/pull/13049/commits/d097adcebd89a520f04352eb215a85916934204f>`__
+
+If you would like to contribute your addition to the library, we recommend you implement tests. An example of such
+tests is visible here: `Adding tests to the MBART ONNX conversion
+<https://github.com/huggingface/transformers/pull/13049/commits/5d642f65abf45ceeb72bd855ca7bfe2506a58e6a>`__

 Graph conversion
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -107,7 +107,8 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

-    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
+    >>> # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
+    >>> # the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

@@ -141,12 +142,13 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

-    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
+    >>> # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
+    >>> # the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")

-    >>> paraphrase_classification_logits = model(paraphrase)[0]
-    >>> not_paraphrase_classification_logits = model(not_paraphrase)[0]
+    >>> paraphrase_classification_logits = model(paraphrase).logits
+    >>> not_paraphrase_classification_logits = model(not_paraphrase).logits

    >>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
    >>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
@@ -197,11 +199,11 @@ positions of the extracted answer in the text.

    >>> result = question_answerer(question="What is extractive question answering?", context=context)
    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-    Answer: 'the task of extracting an answer from a text given a question.', score: 0.6226, start: 34, end: 96
+    Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95

    >>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-    Answer: 'SQuAD dataset,', score: 0.5053, start: 147, end: 161
+    Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160


 Here is an example of question answering using a model and a tokenizer. The process is the following:
@@ -247,10 +249,10 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
    ...
-    ...     answer_start = torch.argmax(
-    ...         answer_start_scores
-    ...     )  # Get the most likely beginning of answer with the argmax of the score
-    ...     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+    ...     # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_start = torch.argmax(answer_start_scores)
+    ...     # Get the most likely end of answer with the argmax of the score 
+    ...     answer_end = torch.argmax(answer_end_scores) + 1
    ...
    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    ...
@@ -261,7 +263,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    Question: What does 🤗 Transformers provide?
    Answer: general - purpose architectures
    Question: 🤗 Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
+    Answer: tensorflow 2. 0 and pytorch
    >>> ## TENSORFLOW CODE
    >>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
    >>> import tensorflow as tf
@@ -290,12 +292,11 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
    ...
-    ...     answer_start = tf.argmax(
-    ...         answer_start_scores, axis=1
-    ...     ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
-    ...     answer_end = (
-    ...         tf.argmax(answer_end_scores, axis=1) + 1
-    ...     ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+    ...     # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
+    ...     # Get the most likely end of answer with the argmax of the score
+    ...     answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1
+    ...
    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    ...
    ...     print(f"Question: {question}")
@@ -305,7 +306,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    Question: What does 🤗 Transformers provide?
    Answer: general - purpose architectures
    Question: 🤗 Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
+    Answer: tensorflow 2. 0 and pytorch



@@ -344,31 +345,31 @@ This outputs the sequences with the mask filled, the confidence score, and the t

    >>> from pprint import pprint
    >>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))
-    [{'score': 0.1792745739221573,
-      'sequence': '<s>HuggingFace is creating a tool that the community uses to '
-                  'solve NLP tasks.</s>',
+    [{'score': 0.1793,
+      'sequence': 'HuggingFace is creating a tool that the community uses to solve '
+                  'NLP tasks.',
      'token': 3944,
-      'token_str': 'Ġtool'},
-     {'score': 0.11349421739578247,
-      'sequence': '<s>HuggingFace is creating a framework that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' tool'},
+     {'score': 0.1135,
+      'sequence': 'HuggingFace is creating a framework that the community uses to '
+                  'solve NLP tasks.',
      'token': 7208,
-      'token_str': 'Ġframework'},
-     {'score': 0.05243554711341858,
-      'sequence': '<s>HuggingFace is creating a library that the community uses to '
-                  'solve NLP tasks.</s>',
+      'token_str': ' framework'},
+     {'score': 0.0524,
+      'sequence': 'HuggingFace is creating a library that the community uses to '
+                  'solve NLP tasks.',
      'token': 5560,
-      'token_str': 'Ġlibrary'},
-     {'score': 0.03493533283472061,
-      'sequence': '<s>HuggingFace is creating a database that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' library'},
+     {'score': 0.0349,
+      'sequence': 'HuggingFace is creating a database that the community uses to '
+                  'solve NLP tasks.',
      'token': 8503,
-      'token_str': 'Ġdatabase'},
-     {'score': 0.02860250137746334,
-      'sequence': '<s>HuggingFace is creating a prototype that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' database'},
+     {'score': 0.0286,
+      'sequence': 'HuggingFace is creating a prototype that the community uses to '
+                  'solve NLP tasks.',
      'token': 17715,
-      'token_str': 'Ġprototype'}]
+      'token_str': ' prototype'}]

 Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following:

@@ -385,42 +386,22 @@ Here is an example of doing masked language modeling using a model and a tokeniz
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForMaskedLM, AutoTokenizer
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+    >>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")

-    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+    >>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+    ...     f"versions would help {tokenizer.mask_token} our carbon footprint."

-    >>> input = tokenizer.encode(sequence, return_tensors="pt")
-    >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

-    >>> token_logits = model(input).logits
+    >>> token_logits = model(**inputs).logits
    >>> mask_token_logits = token_logits[0, mask_token_index, :]

    >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    >>> import tensorflow as tf
-
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    >>> input = tokenizer.encode(sequence, return_tensors="tf")
-    >>> mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
-
-    >>> token_logits = model(input)[0]
-    >>> mask_token_logits = token_logits[0, mask_token_index, :]
-
-    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
-
-
-This prints five sequences, with the top 5 tokens predicted by the model:
-
-.. code-block::

    >>> for token in top_5_tokens:
    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
@@ -429,6 +410,34 @@ This prints five sequences, with the top 5 tokens predicted by the model:
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForMaskedLM, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+    ...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
+
+    >>> token_logits = model(**inputs).logits
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+    >>> for token in top_5_tokens:
+    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+This prints five sequences, with the top 5 tokens predicted by the model.


 Causal Language Modeling
@@ -449,19 +458,20 @@ of tokens.
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
    >>> import torch
    >>> from torch import nn

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

-    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> input_ids = inputs["input_ids"]

    >>> # get logits of last hidden state
-    >>> next_token_logits = model(input_ids).logits[:, -1, :]
+    >>> next_token_logits = model(**inputs).logits[:, -1, :]

    >>> # filter
    >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
@@ -473,19 +483,22 @@ of tokens.
    >>> generated = torch.cat([input_ids, next_token], dim=-1)

    >>> resulting_string = tokenizer.decode(generated.tolist()[0])
+    >>> print(resulting_string)
+    Hugging Face is based in DUMBO, New York City, and ...
    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+    >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")

-    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

-    >>> input_ids = tokenizer.encode(sequence, return_tensors="tf")
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> input_ids = inputs["input_ids"]

    >>> # get logits of last hidden state
-    >>> next_token_logits = model(input_ids)[0][:, -1, :]
+    >>> next_token_logits = model(**inputs).logits[:, -1, :]

    >>> # filter
    >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
@@ -496,14 +509,11 @@ of tokens.
    >>> generated = tf.concat([input_ids, next_token], axis=1)

    >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
-
-
-This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
-
-.. code-block::
-
    >>> print(resulting_string)
-    Hugging Face is based in DUMBO, New York City, and has
+    Hugging Face is based in DUMBO, New York City, and ...
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *is* or
+*features*.

 In the next section, we show how :func:`~transformers.generation_utils.GenerationMixin.generate` can be used to
 generate multiple tokens up to a specified length instead of one token at a time.
@@ -522,7 +532,8 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config

    >>> text_generator = pipeline("text-generation")
    >>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
-    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]
+    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a
+    "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]



@@ -536,9 +547,9 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
@@ -554,41 +565,42 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which
    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""

    >>> prompt = "Today the weather is really nice and I am planning on "
-    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+    >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

-    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> prompt_length = len(tokenizer.decode(inputs[0]))
    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
-
-    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    ... (except for Alexei and Maria) are discovered.
-    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    ... remainder of the story. 1883 Western Siberia,
-    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    ... father initially slaps him for making such an accusation, Rasputin watches as the
-    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
-
-    >>> prompt = "Today the weather is really nice and I am planning on "
-    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
-
-    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-.. code-block::
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]

    >>> print(generated)
-    Today the weather is really nice and I am planning on anning on taking a nice...... of a great time!<eop>...............
+    Today the weather is really nice and I am planning ...
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer
+
+    >>> model = TFAutoModelForCausalLM.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")["input_ids"]
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0]))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+
+    >>> print(generated)
+    Today the weather is really nice and I am planning ...
+

 Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in
 PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-XL* often
@@ -638,21 +650,20 @@ Here are the expected results:

 .. code-block::

-    >>> print(ner_pipe(sequence))
-    [
-        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
-        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
-        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
-        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
-        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
-        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
-        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
-        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
-        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
-        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
-        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
-        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
-    ]
+    >>> for entity in ner_pipe(sequence):
+    ...     print(entity)
+    {'entity': 'I-ORG', 'score': 0.9996, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}
+    {'entity': 'I-ORG', 'score': 0.9910, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}
+    {'entity': 'I-ORG', 'score': 0.9982, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}
+    {'entity': 'I-ORG', 'score': 0.9995, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16}
+    {'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'New', 'start': 40, 'end': 43}
+    {'entity': 'I-LOC', 'score': 0.9993, 'index': 12, 'word': 'York', 'start': 44, 'end': 48}
+    {'entity': 'I-LOC', 'score': 0.9994, 'index': 13, 'word': 'City', 'start': 49, 'end': 53}
+    {'entity': 'I-LOC', 'score': 0.9863, 'index': 19, 'word': 'D', 'start': 79, 'end': 80}
+    {'entity': 'I-LOC', 'score': 0.9514, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82}
+    {'entity': 'I-LOC', 'score': 0.9337, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84}
+    {'entity': 'I-LOC', 'score': 0.9762, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123}
+    {'entity': 'I-LOC', 'score': 0.9915, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130}

 Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City",
 "DUMBO" and "Manhattan Bridge" have been identified as locations.
@@ -679,26 +690,13 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> label_list = [
-    ...     "O",       # Outside of a named entity
-    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-    ...     "I-MISC",  # Miscellaneous entity
-    ...     "B-PER",   # Beginning of a person's name right after another person's name
-    ...     "I-PER",   # Person's name
-    ...     "B-ORG",   # Beginning of an organisation right after another organisation
-    ...     "I-ORG",   # Organisation
-    ...     "B-LOC",   # Beginning of a location right after another location
-    ...     "I-LOC"    # Location
-    ... ]
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
+    ...            "therefore very close to the Manhattan Bridge."

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-    ...            "close to the Manhattan Bridge."
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> tokens = inputs.tokens()

-    >>> # Bit of a hack to get the tokens with the special tokens
-    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    >>> inputs = tokenizer.encode(sequence, return_tensors="pt")
-
-    >>> outputs = model(inputs).logits
+    >>> outputs = model(**inputs).logits
    >>> predictions = torch.argmax(outputs, dim=2)
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
@@ -707,14 +705,13 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-    ...            "close to the Manhattan Bridge."
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
+    ...            "therefore very close to the Manhattan Bridge."

-    >>> # Bit of a hack to get the tokens with the special tokens
-    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    >>> inputs = tokenizer.encode(sequence, return_tensors="tf")
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> tokens = inputs.tokens()

-    >>> outputs = model(inputs)[0]
+    >>> outputs = model(**inputs)[0]
    >>> predictions = tf.argmax(outputs, axis=2)


@@ -755,8 +752,7 @@ illustrated below:
    (',', 'O')
    ('therefore', 'O')
    ('very', 'O')
-    ('##c', 'O')
-    ('##lose', 'O')
+    ('close', 'O')
    ('to', 'O')
    ('the', 'O')
    ('Manhattan', 'I-LOC')
@@ -764,6 +760,7 @@ illustrated below:
    ('.', 'O')
    ('[SEP]', 'O')

+
 Summarization
 -----------------------------------------------------------------------------------------------------------------------

@@ -811,7 +808,9 @@ below. This outputs the following summary:
 .. code-block::

    >>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
-    [{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
+    [{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in
+    the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and
+    2002 . At one time, she was married to eight men at once, prosecutors say .'}]

 Here is an example of doing summarization using a model and a tokenizer. The process is the following:

@@ -833,8 +832,15 @@ CNN / Daily Mail), it yields very good results.
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
-    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    >>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
+    >>> outputs = model.generate(
+    ...     inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
+    ... )
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal
+    counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them
+    between 1999 and 2002.</s>
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

@@ -842,13 +848,15 @@ CNN / Daily Mail), it yields very good results.
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
-    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-
-.. code-block::
+    >>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    >>> outputs = model.generate(
+    ...     inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
+    ... )

    >>> print(tokenizer.decode(outputs[0]))
-    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them between 1999 and 2002.</s>
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal
+    counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them
+    between 1999 and 2002.


 Translation
@@ -888,25 +896,32 @@ Here is an example of doing translation using a model and a tokenizer. The proce
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

-    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
-    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
-    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-
-As with the pipeline example, we get the same translation:
-
-.. code-block::
+    >>> inputs = tokenizer(
+    ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
+    ...     return_tensors="pt"
+    ... )
+    >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

    >>> print(tokenizer.decode(outputs[0]))
-    Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+    <pad> Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.</s>
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
+
+    >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer(
+    ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
+    ...     return_tensors="tf"
+    ... )
+    >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+
+We get the same translation as with the pipeline example.
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -33,7 +33,7 @@ Preparing the datasets
   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
   picture-in-picture" allowfullscreen></iframe>

-We will use the `🤗 Datasets <https:/github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
+We will use the `🤗 Datasets <https://github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
 datasets. We will go over this part pretty quickly. Since the focus of this tutorial is on training, you should refer
 to the 🤗 Datasets `documentation <https://huggingface.co/docs/datasets/>`__ or the :doc:`preprocessing` tutorial for
 more information.
@@ -281,7 +281,7 @@ Fine-tuning in native PyTorch
   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
   picture-in-picture" allowfullscreen></iframe>

-You might need to restart your notebook at this stage to free some memory, or excute the following code:
+You might need to restart your notebook at this stage to free some memory, or execute the following code:

 .. code-block:: python

--- a/examples/flax/README.md
+++ b/examples/flax/README.md
@@ -46,6 +46,8 @@ module abstraction using Python dataclasses that leads to concise and explicit c
 All of our JAX/Flax models are designed to run efficiently on Google
 Cloud TPUs. Here is [a guide for running JAX on Google Cloud TPU](https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm).

+Consider applying for the [Google TPU Research Cloud project](https://sites.research.google/trc/) for free TPU compute.
+
 Each example README contains more details on the specific model and training
 procedure.

--- a/examples/flax/language-modeling/README.md
+++ b/examples/flax/language-modeling/README.md
@@ -49,21 +49,15 @@ Next we clone the model repository to add the tokenizer and model files.
 git clone https://huggingface.co/<your-username>/norwegian-roberta-base
 ```

-To ensure that all tensorboard traces will be uploaded correctly, we need to 
-track them. You can run the following command inside your model repo to do so.
+To setup all relevant files for training, let's go into the cloned model directory.

-```
+```bash
 cd norwegian-roberta-base
-git lfs track "*tfevents*"
 ```

-Great, we have set up our model repository. During training, we will automatically
-push the training logs and model weights to the repo.
-
 Next, let's add a symbolic link to the `run_mlm_flax.py`.

 ```bash
-export MODEL_DIR="./norwegian-roberta-base"
 ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_flax.py
 ```

@@ -71,15 +65,13 @@ ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_fla

 In the first step, we train a tokenizer to efficiently process the text input for the model. Similar to how it is shown in [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train), we use a **`ByteLevelBPETokenizer`**.
 The tokenizer is trained on the complete Norwegian dataset of OSCAR
-and consequently saved in `${MODEL_DIR}`
+and consequently saved in the cloned model directory.
 This can take up to 10 minutes depending on your hardware ☕.

 ```python
 from datasets import load_dataset
 from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer

-model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}
-
 # load dataset
 dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")

@@ -100,7 +92,7 @@ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=
 ])

 # Save files to disk
-tokenizer.save(f"{model_dir}/tokenizer.json")
+tokenizer.save("./tokenizer.json")
 ```

 ### Create configuration
@@ -112,22 +104,23 @@ in the local model folder:
 ```python
 from transformers import RobertaConfig

-model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}
-
-config = RobertaConfig.from_pretrained("roberta-base", vocab_size=tokenizer.vocab_size)
-config.save_pretrained(model_dir)
+config = RobertaConfig.from_pretrained("roberta-base", vocab_size=50265)
+config.save_pretrained("./")
 ```

+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
 ### Train model

 Next we can run the example script to pretrain the model:

 ```bash
 ./run_mlm_flax.py \
-    --output_dir="${MODEL_DIR}" \
+    --output_dir="./" \
    --model_type="roberta" \
-    --config_name="${MODEL_DIR}" \
-    --tokenizer_name="${MODEL_DIR}" \
+    --config_name="./" \
+    --tokenizer_name="./" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_no" \
    --max_seq_length="128" \
@@ -149,7 +142,7 @@ Next we can run the example script to pretrain the model:
 Training should converge at a loss and accuracy 
 of 1.78 and 0.64 respectively after 18 epochs on a single TPUv3-8.
 This should take less than 18 hours.
-Training statistics can be accessed on [tfhub.de](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg).
+Training statistics can be accessed on [tfhub.dev](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg).

 For a step-by-step walkthrough of how to do masked language modeling in Flax, please have a 
 look at [this](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb) google colab.
@@ -180,25 +173,51 @@ Next we clone the model repository to add the tokenizer and model files.
 git clone https://huggingface.co/<your-username>/norwegian-gpt2
 ```

-To ensure that all tensorboard traces will be uploaded correctly, we need to 
-track them. You can run the following command inside your model repo to do so.
-
-```
-cd norwegian-gpt2
-git lfs track "*tfevents*"
-```
-
-Great, we have set up our model repository. During training, we will automatically
-push the training logs and model weights to the repo.
-
-Next, let's add a symbolic link to the `run_clm_flax.py`.
+To setup all relevant files for training, let's go into the cloned model directory.
+
+```bash
+cd norwegian-gpt2
+```
+
+Next, let's add a symbolic link to the training script `run_clm_flax.py`.

 ```bash
-export MODEL_DIR="./norwegian-gpt2"
 ln -s ~/transformers/examples/flax/language-modeling/run_clm_flax.py run_clm_flax.py
 ```

-Next, we'll follow the same steps as above in [Train tokenizer](#train-tokenizer) to train the tokenizer.
+### Train tokenizer
+
+In the first step, we train a tokenizer to efficiently process the text input for the model. Similar to how it is shown in [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train), we use a **`ByteLevelBPETokenizer`**.
+The tokenizer is trained on the complete Norwegian dataset of OSCAR
+and consequently saved in the cloned model directory.
+This can take up to 10 minutes depending on your hardware ☕.
+
+```python
+from datasets import load_dataset
+from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
+
+# load dataset
+dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")
+
+# Instantiate tokenizer
+tokenizer = ByteLevelBPETokenizer()
+
+def batch_iterator(batch_size=1000):
+    for i in range(0, len(dataset), batch_size):
+        yield dataset[i: i + batch_size]["text"]
+
+# Customized training
+tokenizer.train_from_iterator(batch_iterator(), vocab_size=50257, min_frequency=2, special_tokens=[
+    "<s>",
+    "<pad>",
+    "</s>",
+    "<unk>",
+    "<mask>",
+])
+
+# Save files to disk
+tokenizer.save("./tokenizer.json")
+```

 ### Create configuration

@@ -209,22 +228,23 @@ in the local model folder:
 ```python
 from transformers import GPT2Config

-model_dir = "./norwegian-gpt2"  # ${MODEL_DIR}
-
-config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=tokenizer.vocab_size)
-config.save_pretrained(model_dir)
+config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=50257)
+config.save_pretrained("./")
 ```

+Great, we have set up our model repository. During training, we will now automatically
+push the training logs and model weights to the repo.
+
 ### Train model

-Next we can run the example script to pretrain the model:
+Finally, we can run the example script to pretrain the model:

 ```bash
 ./run_clm_flax.py \
-    --output_dir="${MODEL_DIR}" \
+    --output_dir="./" \
    --model_type="gpt2" \
-    --config_name="${MODEL_DIR}" \
-    --tokenizer_name="${MODEL_DIR}" \
+    --config_name="./" \
+    --tokenizer_name="./" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_no" \
    --do_train --do_eval \
@@ -246,6 +266,9 @@ of 3.24 and 25.72 respectively after 20 epochs on a single TPUv3-8.
 This should take less than ~21 hours.
 Training statistics can be accessed on [tfhub.de](https://tensorboard.dev/experiment/2zEhLwJ0Qp2FAkI3WVH9qA).

+For a step-by-step walkthrough of how to do causal language modeling in Flax, please have a 
+look at [this](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb) google colab.
+
 ## T5-like span-masked language modeling

 In the following, we demonstrate how to train a T5 model using the span-masked language model 
@@ -272,21 +295,15 @@ Next we clone the model repository to add the tokenizer and model files.
 git clone https://huggingface.co/<your-username>/norwegian-t5-base
 ```

-To ensure that all tensorboard traces will be uploaded correctly, we need to 
-track them. You can run the following command inside your model repo to do so.
+To setup all relevant files for trairing, let's go into the cloned model directory.

-```
+```bash
 cd norwegian-t5-base
-git lfs track "*tfevents*"
 ```

-Great, we have set up our model repository. During training, we will automatically
-push the training logs and model weights to the repo.
-
 Next, let's add a symbolic link to the `run_t5_mlm_flax.py` and `t5_tokenizer_model` scripts.

 ```bash
-export MODEL_DIR="./norwegian-t5-base"
 ln -s ~/transformers/examples/flax/language-modeling/run_t5_mlm_flax.py run_t5_mlm_flax.py
 ln -s ~/transformers/examples/flax/language-modeling/t5_tokenizer_model.py t5_tokenizer_model.py
 ```
@@ -299,7 +316,7 @@ a sentencepiece unigram tokenizer as shown in [t5_tokenizer_model.py](https://gi
 which is heavily inspired from [yandex-research/DeDLOC's tokenizer model](https://github.com/yandex-research/DeDLOC/blob/5c994bc64e573702a9a79add3ecd68b38f14b548/sahajbert/tokenizer/tokenizer_model.py) .

 The tokenizer is trained on the complete Norwegian dataset of OSCAR
-and consequently saved in `${MODEL_DIR}`
+and consequently saved in the cloned model directory.
 This can take up to 120 minutes depending on your hardware ☕☕☕ .

 ```python
@@ -310,7 +327,6 @@ from t5_tokenizer_model import SentencePieceUnigramTokenizer

 vocab_size = 32_000
 input_sentence_size = None
-model_dir = "./norwegian-t5-base"  # ${MODEL_DIR}

 # Initialize a dataset
 dataset = datasets.load_dataset("oscar", name="unshuffled_deduplicated_no", split="train")
@@ -335,7 +351,7 @@ tokenizer.train_from_iterator(
 )

 # Save files to disk
-tokenizer.save(f"{model_dir}/tokenizer.json")
+tokenizer.save("./tokenizer.json")
 ```

 ### Create configuration
@@ -347,12 +363,13 @@ in the local model folder:
 ```python
 from transformers import T5Config

-model_dir = "./norwegian-t5-base"  # ${MODEL_DIR}
-
-config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.vocab_size)
-config.save_pretrained(model_dir)
+config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.get_vocab_size())
+config.save_pretrained("./")
 ```

+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
 ### Train model

 Next we can run the example script to pretrain the model:
@@ -373,15 +390,15 @@ Next we can run the example script to pretrain the model:
 	--weight_decay="0.001" \
 	--warmup_steps="2000" \
 	--overwrite_output_dir \
-	--logging_steps="100" \
-	--save_steps="1000" \
-	--eval_steps="1000" \
+	--logging_steps="500" \
+	--save_steps="10000" \
+	--eval_steps="2500" \
 	--push_to_hub
 ```

 Training should converge at a loss and accuracy 
-of 2.2 and 58.0 respectively after 2 epochs on a single TPUv3-8.
-This should take around 24 hours.
+of 2.36 and 57.0 respectively after 3 epochs on a single TPUv3-8.
+This should take around 4.5 hours.
 Training statistics can be accessed on directly on the 🤗 [hub](https://huggingface.co/patrickvonplaten/t5-base-norwegian/tensorboard)

 ## Runtime evaluation
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -31,6 +31,7 @@ from pathlib import Path
 from typing import Callable, Optional

 import datasets
+import numpy as np
 from datasets import Dataset, load_dataset
 from tqdm import tqdm

@@ -51,6 +52,7 @@ from transformers import (
    HfArgumentParser,
    TrainingArguments,
    is_tensorboard_available,
+    set_seed,
 )
 from transformers.testing_utils import CaptureLogger

@@ -154,6 +156,9 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -182,18 +187,16 @@ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuf
    steps_per_epoch = len(dataset) // batch_size

    if shuffle:
-        batch_idx = jax.random.permutation(rng, len(dataset))
+        batch_idx = np.random.permutation(len(dataset))
    else:
-        batch_idx = jnp.arange(len(dataset))
+        batch_idx = np.arange(len(dataset))

    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))

    for idx in batch_idx:
        batch = dataset[idx]
-        batch = {k: jnp.array(v) for k, v in batch.items()}
-
-        batch = shard(batch)
+        batch = {k: np.array(v) for k, v in batch.items()}

        yield batch

@@ -269,6 +272,9 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
    #  Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -299,6 +305,7 @@ def main():
            )
    else:
        data_files = {}
+        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
@@ -306,20 +313,23 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)

-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        if "validation" not in dataset.keys():
+            dataset["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
+                **dataset_args,
            )
-            datasets["train"] = load_dataset(
+            dataset["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
+                **dataset_args,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -577,7 +587,7 @@ def main():

    train_time = 0
    train_metrics = []
-    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
@@ -591,6 +601,7 @@ def main():
        # train
        for step in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
            batch = next(train_loader)
+            batch = shard(batch)
            state, train_metric = p_train_step(state, batch)
            train_metrics.append(train_metric)

@@ -617,6 +628,7 @@ def main():
                for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
                    # Model forward
                    batch = next(eval_loader)
+                    batch = shard(batch)
                    metrics = p_eval_step(state.params, batch)
                    eval_metrics.append(metrics)

--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -214,7 +214,7 @@ class FlaxDataCollatorForLanguageModeling:

    def mask_tokens(
        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
-    ) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -353,7 +353,8 @@ class FlaxDataCollatorForT5MLM:
            np.random.shuffle(mask_indices)
            first_in_segment = np.pad(mask_indices, [[1, 0]])
            segment_id = np.cumsum(first_in_segment)
-            segment_length = np.asarray(jax.ops.segment_sum(np.ones_like(segment_id), segment_id))
+            # count length of sub segments assuming that list is sorted
+            _, segment_length = np.unique(segment_id, return_counts=True)
            return segment_length

        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
@@ -720,7 +721,7 @@ if __name__ == "__main__":
    state = jax_utils.replicate(state)

    train_time = 0
-    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
--- a/examples/flax/question-answering/README.md
+++ b/examples/flax/question-answering/README.md
@@ -0,0 +1,128 @@
+<!---
+Copyright 2021 The Google Flax Team Authors and HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Question Answering examples
+
+Based on the script [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/flax/question-answering/run_qa.py).
+
+**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
+uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
+[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version
+of the script.
+
+
+The following example fine-tunes BERT on SQuAD:
+
+To begin with it is recommended to create a model repository to save the trained model and logs.
+Here we call the model `"bert-qa-squad-test"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create bert-qa-squad-test
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/bert-qa-squad-test
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_qa.py`.
+
+```bash
+export MODEL_DIR="./bert-qa-squad-test"
+ln -s ~/transformers/examples/flax/question-answering/run_qa.py run_qa.py
+```
+
+```bash
+python run_qa.py \
+  --model_name_or_path bert-base-uncased \
+  --dataset_name squad \
+  --do_train   \
+  --do_eval   \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2 \
+  --per_device_train_batch_size 12 \
+  --output_dir ${MODEL_DIR} \
+  --eval_steps 1000 \
+  --push_to_hub
+```
+
+Using the command above, the script will train for 2 epochs and run eval after each epoch. 
+Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
+You can see the results by running `tensorboard` in that directory:
+
+```bash
+$ tensorboard --logdir .
+```
+
+or directly on the hub under *Training metrics*.
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.62
+exact_match = 81.34
+```
+
+sample Metrics - [tfhub.dev](https://tensorboard.dev/experiment/6gU75Hx8TGCnc6tr4ZgI9Q)
+
+Here is an example training on 4 TITAN RTX GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python run_qa.py   \
+--model_name_or_path bert-large-uncased-whole-word-masking   \
+--dataset_name squad   \
+--do_train   \
+--do_eval   \
+--per_device_train_batch_size 6   \
+--learning_rate 3e-5   \
+--num_train_epochs 2   \
+--max_seq_length 384   \
+--doc_stride 128   \
+--output_dir /tmp/wwm_uncased_finetuned_squad/ \
+--eval_steps 1000
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.31
+exact_match = 87.04
+```
+
+
+### Usage notes
+
+Note that when contexts are long they may be split into multiple training cases, not all of which may contain
+the answer span. 
+
+As-is, the example script will train on SQuAD or any other question-answering dataset formatted the same way, and can handle user
+inputs as well.
+
+### Memory usage and data loading
+
+One thing to note is that all data is loaded into memory in this script. Most question answering datasets are small
+enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle
+data streaming.
--- a/examples/flax/question-answering/requirements.txt
+++ b/examples/flax/question-answering/requirements.txt
@@ -0,0 +1,5 @@
+datasets >= 1.8.0
+jax>=0.2.17
+jaxlib>=0.1.68
+flax>=0.3.4
+optax>=0.0.8
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -0,0 +1,905 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for question answering.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass, field
+from itertools import chain
+from typing import Any, Callable, Dict, Optional, Tuple
+
+import datasets
+import numpy as np
+from datasets import load_dataset, load_metric
+from tqdm import tqdm
+
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+from flax import struct, traverse_util
+from flax.jax_utils import replicate, unreplicate
+from flax.metrics import tensorboard
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    EvalPrediction,
+    FlaxAutoModelForQuestionAnswering,
+    HfArgumentParser,
+    PreTrainedTokenizerFast,
+    TrainingArguments,
+)
+from transformers.utils import check_min_version
+from utils_qa import postprocess_qa_predictions
+
+
+logger = logging.getLogger(__name__)
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.11.0")
+
+Array = Any
+Dataset = datasets.arrow_dataset.Dataset
+PRNGKey = Any
+
+
+# region Arguments
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_seq_length: int = field(
+        default=384,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
+            "be faster on GPU but will be slower on TPU)."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    max_predict_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+            "value if set."
+        },
+    )
+    version_2_with_negative: bool = field(
+        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
+    )
+    null_score_diff_threshold: float = field(
+        default=0.0,
+        metadata={
+            "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
+            "the score of the null answer minus this threshold, the null answer is selected for this example. "
+            "Only useful when `version_2_with_negative=True`."
+        },
+    )
+    doc_stride: int = field(
+        default=128,
+        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
+    )
+    n_best_size: int = field(
+        default=20,
+        metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
+    )
+    max_answer_length: int = field(
+        default=30,
+        metadata={
+            "help": "The maximum length of an answer that can be generated. This is needed because the start "
+            "and end predictions are not conditioned on one another."
+        },
+    )
+
+    def __post_init__(self):
+        if (
+            self.dataset_name is None
+            and self.train_file is None
+            and self.validation_file is None
+            and self.test_file is None
+        ):
+            raise ValueError("Need either a dataset name or a training/validation file/test_file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+            if self.test_file is not None:
+                extension = self.test_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
+
+
+# endregion
+
+# region Create a train state
+def create_train_state(
+    model: FlaxAutoModelForQuestionAnswering,
+    learning_rate_fn: Callable[[int], float],
+    num_labels: int,
+    training_args: TrainingArguments,
+) -> train_state.TrainState:
+    """Create initial training state."""
+
+    class TrainState(train_state.TrainState):
+        """Train state with an Optax optimizer.
+
+        The two functions below differ depending on whether the task is classification
+        or regression.
+
+        Args:
+          logits_fn: Applied to last layer to obtain the logits.
+          loss_fn: Function to compute the loss.
+        """
+
+        logits_fn: Callable = struct.field(pytree_node=False)
+        loss_fn: Callable = struct.field(pytree_node=False)
+
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+
+    tx = optax.adamw(
+        learning_rate=learning_rate_fn,
+        b1=training_args.adam_beta1,
+        b2=training_args.adam_beta2,
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+        mask=decay_mask_fn,
+    )
+
+    def cross_entropy_loss(logits, labels):
+        start_loss = optax.softmax_cross_entropy(logits[0], onehot(labels[0], num_classes=num_labels))
+        end_loss = optax.softmax_cross_entropy(logits[1], onehot(labels[1], num_classes=num_labels))
+        xentropy = (start_loss + end_loss) / 2.0
+        return jnp.mean(xentropy)
+
+    return TrainState.create(
+        apply_fn=model.__call__,
+        params=model.params,
+        tx=tx,
+        logits_fn=lambda logits: logits,
+        loss_fn=cross_entropy_loss,
+    )
+
+
+# endregion
+
+
+# region Create learning rate function
+def create_learning_rate_fn(
+    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+) -> Callable[[int], jnp.array]:
+    """Returns a linear warmup, linear_decay learning rate function."""
+    steps_per_epoch = train_ds_size // train_batch_size
+    num_train_steps = steps_per_epoch * num_train_epochs
+    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+    decay_fn = optax.linear_schedule(
+        init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+    )
+    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+    return schedule_fn
+
+
+# endregion
+
+# region train data iterator
+def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
+    """Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
+    steps_per_epoch = len(dataset) // batch_size
+    perms = jax.random.permutation(rng, len(dataset))
+    perms = perms[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+    perms = perms.reshape((steps_per_epoch, batch_size))
+
+    for perm in perms:
+        batch = dataset[perm]
+        batch = {k: np.array(v) for k, v in batch.items()}
+        batch = shard(batch)
+
+        yield batch
+
+
+# endregion
+
+# region eval data iterator
+def eval_data_collator(dataset: Dataset, batch_size: int):
+    """Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
+    for i in range(len(dataset) // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size]
+        batch = {k: np.array(v) for k, v in batch.items()}
+        batch = shard(batch)
+
+        yield batch
+
+
+# endregion
+
+
+def main():
+    # region Argument parsing
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # endregion
+
+    # region Logging
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    # endregion
+
+    # region Load Data
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+    else:
+        # Loading the dataset from local csv or json file.
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
+
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+            extension = data_args.validation_file.split(".")[-1]
+        if data_args.test_file is not None:
+            data_files["test"] = data_args.test_file
+            extension = data_args.test_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # endregion
+
+    # region Load pretrained model and tokenizer
+    #
+    # Load pretrained model and tokenizer
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        use_fast=True,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # endregion
+
+    # region Tokenizer check: this script requires a fast tokenizer.
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        raise ValueError(
+            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
+            "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
+            "requirement"
+        )
+    # endregion
+
+    # region Preprocessing the datasets
+    # Preprocessing is slightly different for training and evaluation.
+    if training_args.do_train:
+        column_names = raw_datasets["train"].column_names
+    elif training_args.do_eval:
+        column_names = raw_datasets["validation"].column_names
+    else:
+        column_names = raw_datasets["test"].column_names
+    question_column_name = "question" if "question" in column_names else column_names[0]
+    context_column_name = "context" if "context" in column_names else column_names[1]
+    answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+    # Padding side determines if we do (question|context) or (context|question).
+    pad_on_right = tokenizer.padding_side == "right"
+
+    if data_args.max_seq_length > tokenizer.model_max_length:
+        logger.warning(
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+        )
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    # Training preprocessing
+    def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
+        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possible giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=data_args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length",
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+        # The offset mappings will give us a map from token to character position in the original context. This will
+        # help us compute the start_positions and end_positions.
+        offset_mapping = tokenized_examples.pop("offset_mapping")
+
+        # Let's label those examples!
+        tokenized_examples["start_positions"] = []
+        tokenized_examples["end_positions"] = []
+
+        for i, offsets in enumerate(offset_mapping):
+            # We will label impossible answers with the index of the CLS token.
+            input_ids = tokenized_examples["input_ids"][i]
+            cls_index = input_ids.index(tokenizer.cls_token_id)
+
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples.sequence_ids(i)
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            answers = examples[answer_column_name][sample_index]
+            # If no answers are given, set the cls_index as answer.
+            if len(answers["answer_start"]) == 0:
+                tokenized_examples["start_positions"].append(cls_index)
+                tokenized_examples["end_positions"].append(cls_index)
+            else:
+                # Start/end character index of the answer in the text.
+                start_char = answers["answer_start"][0]
+                end_char = start_char + len(answers["text"][0])
+
+                # Start token index of the current span in the text.
+                token_start_index = 0
+                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
+                    token_start_index += 1
+
+                # End token index of the current span in the text.
+                token_end_index = len(input_ids) - 1
+                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
+                    token_end_index -= 1
+
+                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
+                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
+                    tokenized_examples["start_positions"].append(cls_index)
+                    tokenized_examples["end_positions"].append(cls_index)
+                else:
+                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
+                    # Note: we could go after the last offset if the answer is the last word (edge case).
+                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
+                        token_start_index += 1
+                    tokenized_examples["start_positions"].append(token_start_index - 1)
+                    while offsets[token_end_index][1] >= end_char:
+                        token_end_index -= 1
+                    tokenized_examples["end_positions"].append(token_end_index + 1)
+
+        return tokenized_examples
+
+    processed_raw_datasets = dict()
+    if training_args.do_train:
+        if "train" not in raw_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = raw_datasets["train"]
+        if data_args.max_train_samples is not None:
+            # We will select sample from whole data if agument is specified
+            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        # Create train feature from dataset
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_train_samples is not None:
+            # Number of samples might increase during Feature Creation, We select only specified max samples
+            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        processed_raw_datasets["train"] = train_dataset
+
+    # Validation preprocessing
+    def prepare_validation_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
+        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possible giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=data_args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length",
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+        # corresponding example_id and we will store the offset mappings.
+        tokenized_examples["example_id"] = []
+
+        for i in range(len(tokenized_examples["input_ids"])):
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples.sequence_ids(i)
+            context_index = 1 if pad_on_right else 0
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+            # position is part of the context or not.
+            tokenized_examples["offset_mapping"][i] = [
+                (o if sequence_ids[k] == context_index else None)
+                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+            ]
+
+        return tokenized_examples
+
+    if training_args.do_eval:
+        if "validation" not in raw_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_examples = raw_datasets["validation"]
+        if data_args.max_eval_samples is not None:
+            # We will select sample from whole data
+            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+        # Validation Feature Creation
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_eval_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        processed_raw_datasets["validation"] = eval_dataset
+
+    if training_args.do_predict:
+        if "test" not in raw_datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        predict_examples = raw_datasets["test"]
+        if data_args.max_predict_samples is not None:
+            # We will select sample from whole data
+            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
+        # Predict Feature Creation
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_predict_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        processed_raw_datasets["test"] = predict_dataset
+    # endregion
+
+    # region Metrics and Post-processing:
+    def post_processing_function(examples, features, predictions, stage="eval"):
+        # Post-processing: we match the start logits and end logits to answers in the original context.
+        predictions = postprocess_qa_predictions(
+            examples=examples,
+            features=features,
+            predictions=predictions,
+            version_2_with_negative=data_args.version_2_with_negative,
+            n_best_size=data_args.n_best_size,
+            max_answer_length=data_args.max_answer_length,
+            null_score_diff_threshold=data_args.null_score_diff_threshold,
+            output_dir=training_args.output_dir,
+            prefix=stage,
+        )
+        # Format the result to the format the metric expects.
+        if data_args.version_2_with_negative:
+            formatted_predictions = [
+                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+            ]
+        else:
+            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+
+        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+        return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+
+    def compute_metrics(p: EvalPrediction):
+        return metric.compute(predictions=p.predictions, references=p.label_ids)
+
+    # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
+    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
+        """
+        Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
+
+        Args:
+            start_or_end_logits(:obj:`tensor`):
+                This is the output predictions of the model. We can only enter either start or end logits.
+            eval_dataset: Evaluation dataset
+            max_len(:obj:`int`):
+                The maximum length of the output tensor. ( See the model.eval() part for more details )
+        """
+
+        step = 0
+        # create a numpy array and fill it with -100.
+        logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64)
+        # Now since we have create an array now we will populate it with the outputs of the model.
+        for i, output_logit in enumerate(start_or_end_logits):  # populate columns
+            # We have to fill it such that we have to take the whole tensor and replace it on the newly created array
+            # And after every iteration we have to change the step
+
+            batch_size = output_logit.shape[0]
+            cols = output_logit.shape[1]
+
+            if step + batch_size < len(dataset):
+                logits_concat[step : step + batch_size, :cols] = output_logit
+            else:
+                logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
+
+            step += batch_size
+
+        return logits_concat
+
+    # endregion
+
+    # region Training steps and logging init
+    train_dataset = processed_raw_datasets["train"]
+    eval_dataset = processed_raw_datasets["validation"]
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # Define a summary writer
+    summary_writer = tensorboard.SummaryWriter(training_args.output_dir)
+    summary_writer.hparams({**training_args.to_dict(), **vars(model_args), **vars(data_args)})
+
+    def write_train_metric(summary_writer, train_metrics, train_time, step):
+        summary_writer.scalar("train_time", train_time, step)
+
+        train_metrics = get_metrics(train_metrics)
+        for key, vals in train_metrics.items():
+            tag = f"train_{key}"
+            for i, val in enumerate(vals):
+                summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+    def write_eval_metric(summary_writer, eval_metrics, step):
+        for metric_name, value in eval_metrics.items():
+            summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+    num_epochs = int(training_args.num_train_epochs)
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+    train_batch_size = training_args.per_device_train_batch_size * jax.local_device_count()
+    eval_batch_size = training_args.per_device_eval_batch_size * jax.local_device_count()
+    # endregion
+
+    # region Load model
+    model = FlaxAutoModelForQuestionAnswering.from_pretrained(
+        model_args.model_name_or_path,
+        config=config,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+        seed=training_args.seed,
+        dtype=getattr(jnp, model_args.dtype),
+    )
+
+    learning_rate_fn = create_learning_rate_fn(
+        len(train_dataset),
+        train_batch_size,
+        training_args.num_train_epochs,
+        training_args.warmup_steps,
+        training_args.learning_rate,
+    )
+
+    state = create_train_state(model, learning_rate_fn, num_labels=max_seq_length, training_args=training_args)
+    # endregion
+
+    # region Define train step functions
+    def train_step(
+        state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey
+    ) -> Tuple[train_state.TrainState, float]:
+        """Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`."""
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        start_positions = batch.pop("start_positions")
+        end_positions = batch.pop("end_positions")
+        targets = (start_positions, end_positions)
+
+        def loss_fn(params):
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)
+            loss = state.loss_fn(logits, targets)
+            return loss
+
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch")
+        return new_state, metrics, new_dropout_rng
+
+    p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,))
+    # endregion
+
+    # region Define eval step functions
+    def eval_step(state, batch):
+        logits = state.apply_fn(**batch, params=state.params, train=False)
+        return state.logits_fn(logits)
+
+    p_eval_step = jax.pmap(eval_step, axis_name="batch")
+    # endregion
+
+    # region Define train and eval loop
+    logger.info(f"===== Starting training ({num_epochs} epochs) =====")
+    train_time = 0
+
+    # make sure weights are replicated on each device
+    state = replicate(state)
+
+    train_time = 0
+    step_per_epoch = len(train_dataset) // train_batch_size
+    total_steps = step_per_epoch * num_epochs
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+
+        train_start = time.time()
+        train_metrics = []
+
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+
+        # train
+        for step, batch in enumerate(
+            tqdm(
+                train_data_collator(input_rng, train_dataset, train_batch_size),
+                total=step_per_epoch,
+                desc="Training...",
+                position=1,
+            ),
+            1,
+        ):
+            state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs)
+            train_metrics.append(train_metric)
+
+            cur_step = epoch * step_per_epoch + step
+
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+                epochs.write(
+                    f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+
+                train_metrics = []
+
+            if (
+                training_args.do_eval
+                and (cur_step % training_args.eval_steps == 0 or cur_step % step_per_epoch == 0)
+                and cur_step > 0
+            ):
+
+                eval_metrics = {}
+                all_start_logits = []
+                all_end_logits = []
+                # evaluate
+                for batch in tqdm(
+                    eval_data_collator(eval_dataset, eval_batch_size),
+                    total=len(eval_dataset) // eval_batch_size,
+                    desc="Evaluating ...",
+                    position=2,
+                ):
+                    _ = batch.pop("example_id")
+                    _ = batch.pop("offset_mapping")
+                    predictions = p_eval_step(state, batch)
+                    start_logits = np.array([pred for pred in chain(*predictions[0])])
+                    end_logits = np.array([pred for pred in chain(*predictions[1])])
+                    all_start_logits.append(start_logits)
+                    all_end_logits.append(end_logits)
+
+                # evaluate also on leftover examples (not divisible by batch_size)
+                num_leftover_samples = len(eval_dataset) % eval_batch_size
+
+                # make sure leftover batch is evaluated on one device
+                if num_leftover_samples > 0 and jax.process_index() == 0:
+                    # take leftover samples
+                    batch = eval_dataset[-num_leftover_samples:]
+                    batch = {k: np.array(v) for k, v in batch.items()}
+                    _ = batch.pop("example_id")
+                    _ = batch.pop("offset_mapping")
+
+                    predictions = eval_step(unreplicate(state), batch)
+                    start_logits = np.array([pred for pred in predictions[0]])
+                    end_logits = np.array([pred for pred in predictions[1]])
+                    all_start_logits.append(start_logits)
+                    all_end_logits.append(end_logits)
+
+                max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor
+
+                # concatenate the numpy array
+                start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
+                end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)
+
+                # delete the list of numpy arrays
+                del all_start_logits
+                del all_end_logits
+                outputs_numpy = (start_logits_concat, end_logits_concat)
+                prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
+                eval_metrics = compute_metrics(prediction)
+
+                logger.info(f"Step... ({cur_step}/{total_steps} | Evaluation metrics: {eval_metrics})")
+
+                if jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps):
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(unreplicate(state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
+        epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}"
+    # endregion
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/flax/question-answering/utils_qa.py
+++ b/examples/flax/question-answering/utils_qa.py
@@ -0,0 +1,427 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Post-processing utilities for question answering.
+"""
+import collections
+import json
+import logging
+import os
+from typing import Optional, Tuple
+
+import numpy as np
+from tqdm.auto import tqdm
+
+
+logger = logging.getLogger(__name__)
+
+
+def postprocess_qa_predictions(
+    examples,
+    features,
+    predictions: Tuple[np.ndarray, np.ndarray],
+    version_2_with_negative: bool = False,
+    n_best_size: int = 20,
+    max_answer_length: int = 30,
+    null_score_diff_threshold: float = 0.0,
+    output_dir: Optional[str] = None,
+    prefix: Optional[str] = None,
+    log_level: Optional[int] = logging.WARNING,
+):
+    """
+    Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
+    original contexts. This is the base postprocessing functions for models that only return start and end logits.
+
+    Args:
+        examples: The non-preprocessed dataset (see the main script for more information).
+        features: The processed dataset (see the main script for more information).
+        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+            first dimension must match the number of elements of :obj:`features`.
+        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the underlying dataset contains examples with no answers.
+        n_best_size (:obj:`int`, `optional`, defaults to 20):
+            The total number of n-best predictions to generate when looking for an answer.
+        max_answer_length (:obj:`int`, `optional`, defaults to 30):
+            The maximum length of an answer that can be generated. This is needed because the start and end predictions
+            are not conditioned on one another.
+        null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
+            The threshold used to select the null answer: if the best answer has a score that is less than the score of
+            the null answer minus this threshold, the null answer is selected for this example (note that the score of
+            the null answer for an example giving several features is the minimum of the scores for the null answer on
+            each feature: all features must be aligned on the fact they `want` to predict a null answer).
+
+            Only useful when :obj:`version_2_with_negative` is :obj:`True`.
+        output_dir (:obj:`str`, `optional`):
+            If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+            :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+            answers, are saved in `output_dir`.
+        prefix (:obj:`str`, `optional`):
+            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
+    """
+    assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
+    all_start_logits, all_end_logits = predictions
+
+    assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features."
+
+    # Build a map example to its corresponding features.
+    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+    features_per_example = collections.defaultdict(list)
+    for i, feature in enumerate(features):
+        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+    # The dictionaries we have to fill.
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    if version_2_with_negative:
+        scores_diff_json = collections.OrderedDict()
+
+    # Logging.
+    logger.setLevel(log_level)
+    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+    # Let's loop over all the examples!
+    for example_index, example in enumerate(tqdm(examples)):
+        # Those are the indices of the features associated to the current example.
+        feature_indices = features_per_example[example_index]
+
+        min_null_prediction = None
+        prelim_predictions = []
+
+        # Looping through all the features associated to the current example.
+        for feature_index in feature_indices:
+            # We grab the predictions of the model for this feature.
+            start_logits = all_start_logits[feature_index]
+            end_logits = all_end_logits[feature_index]
+            # This is what will allow us to map some the positions in our logits to span of texts in the original
+            # context.
+            offset_mapping = features[feature_index]["offset_mapping"]
+            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+            # available in the current feature.
+            token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+            # Update minimum null prediction.
+            feature_null_score = start_logits[0] + end_logits[0]
+            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
+                min_null_prediction = {
+                    "offsets": (0, 0),
+                    "score": feature_null_score,
+                    "start_logit": start_logits[0],
+                    "end_logit": end_logits[0],
+                }
+
+            # Go through all possibilities for the `n_best_size` greater start and end logits.
+            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
+            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
+                    # to part of the input_ids that are not in the context.
+                    if (
+                        start_index >= len(offset_mapping)
+                        or end_index >= len(offset_mapping)
+                        or offset_mapping[start_index] is None
+                        or offset_mapping[end_index] is None
+                    ):
+                        continue
+                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
+                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+                        continue
+                    # Don't consider answer that don't have the maximum context available (if such information is
+                    # provided).
+                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
+                        continue
+                    prelim_predictions.append(
+                        {
+                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
+                            "score": start_logits[start_index] + end_logits[end_index],
+                            "start_logit": start_logits[start_index],
+                            "end_logit": end_logits[end_index],
+                        }
+                    )
+        if version_2_with_negative:
+            # Add the minimum null prediction
+            prelim_predictions.append(min_null_prediction)
+            null_score = min_null_prediction["score"]
+
+        # Only keep the best `n_best_size` predictions.
+        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
+
+        # Add back the minimum null prediction if it was removed because of its low score.
+        if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions):
+            predictions.append(min_null_prediction)
+
+        # Use the offsets to gather the answer text in the original context.
+        context = example["context"]
+        for pred in predictions:
+            offsets = pred.pop("offsets")
+            pred["text"] = context[offsets[0] : offsets[1]]
+
+        # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+        # failure.
+        if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
+            predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
+
+        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+        # the LogSumExp trick).
+        scores = np.array([pred.pop("score") for pred in predictions])
+        exp_scores = np.exp(scores - np.max(scores))
+        probs = exp_scores / exp_scores.sum()
+
+        # Include the probabilities in our predictions.
+        for prob, pred in zip(probs, predictions):
+            pred["probability"] = prob
+
+        # Pick the best prediction. If the null answer is not possible, this is easy.
+        if not version_2_with_negative:
+            all_predictions[example["id"]] = predictions[0]["text"]
+        else:
+            # Otherwise we first need to find the best non-empty prediction.
+            i = 0
+            while predictions[i]["text"] == "":
+                i += 1
+            best_non_null_pred = predictions[i]
+
+            # Then we compare to the null prediction using the threshold.
+            score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
+            scores_diff_json[example["id"]] = float(score_diff)  # To be JSON-serializable.
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example["id"]] = ""
+            else:
+                all_predictions[example["id"]] = best_non_null_pred["text"]
+
+        # Make `predictions` JSON-serializable by casting np.float back to float.
+        all_nbest_json[example["id"]] = [
+            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+            for pred in predictions
+        ]
+
+    # If we have an output_dir, let's save all those dicts.
+    if output_dir is not None:
+        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
+
+        prediction_file = os.path.join(
+            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+        )
+        nbest_file = os.path.join(
+            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+        )
+        if version_2_with_negative:
+            null_odds_file = os.path.join(
+                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+            )
+
+        logger.info(f"Saving predictions to {prediction_file}.")
+        with open(prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+        logger.info(f"Saving nbest_preds to {nbest_file}.")
+        with open(nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+        if version_2_with_negative:
+            logger.info(f"Saving null_odds to {null_odds_file}.")
+            with open(null_odds_file, "w") as writer:
+                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
+
+
+def postprocess_qa_predictions_with_beam_search(
+    examples,
+    features,
+    predictions: Tuple[np.ndarray, np.ndarray],
+    version_2_with_negative: bool = False,
+    n_best_size: int = 20,
+    max_answer_length: int = 30,
+    start_n_top: int = 5,
+    end_n_top: int = 5,
+    output_dir: Optional[str] = None,
+    prefix: Optional[str] = None,
+    log_level: Optional[int] = logging.WARNING,
+):
+    """
+    Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
+    original contexts. This is the postprocessing functions for models that return start and end logits, indices, as well as
+    cls token predictions.
+
+    Args:
+        examples: The non-preprocessed dataset (see the main script for more information).
+        features: The processed dataset (see the main script for more information).
+        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+            first dimension must match the number of elements of :obj:`features`.
+        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the underlying dataset contains examples with no answers.
+        n_best_size (:obj:`int`, `optional`, defaults to 20):
+            The total number of n-best predictions to generate when looking for an answer.
+        max_answer_length (:obj:`int`, `optional`, defaults to 30):
+            The maximum length of an answer that can be generated. This is needed because the start and end predictions
+            are not conditioned on one another.
+        start_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top start logits too keep when searching for the :obj:`n_best_size` predictions.
+        end_n_top (:obj:`int`, `optional`, defaults to 5):
+            The number of top end logits too keep when searching for the :obj:`n_best_size` predictions.
+        output_dir (:obj:`str`, `optional`):
+            If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+            :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+            answers, are saved in `output_dir`.
+        prefix (:obj:`str`, `optional`):
+            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
+    """
+    assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
+    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
+
+    assert len(predictions[0]) == len(
+        features
+    ), f"Got {len(predictions[0])} predicitions and {len(features)} features."
+
+    # Build a map example to its corresponding features.
+    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+    features_per_example = collections.defaultdict(list)
+    for i, feature in enumerate(features):
+        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+    # The dictionaries we have to fill.
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
+
+    # Logging.
+    logger.setLevel(log_level)
+    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+    # Let's loop over all the examples!
+    for example_index, example in enumerate(tqdm(examples)):
+        # Those are the indices of the features associated to the current example.
+        feature_indices = features_per_example[example_index]
+
+        min_null_score = None
+        prelim_predictions = []
+
+        # Looping through all the features associated to the current example.
+        for feature_index in feature_indices:
+            # We grab the predictions of the model for this feature.
+            start_log_prob = start_top_log_probs[feature_index]
+            start_indexes = start_top_index[feature_index]
+            end_log_prob = end_top_log_probs[feature_index]
+            end_indexes = end_top_index[feature_index]
+            feature_null_score = cls_logits[feature_index]
+            # This is what will allow us to map some the positions in our logits to span of texts in the original
+            # context.
+            offset_mapping = features[feature_index]["offset_mapping"]
+            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+            # available in the current feature.
+            token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+            # Update minimum null prediction
+            if min_null_score is None or feature_null_score < min_null_score:
+                min_null_score = feature_null_score
+
+            # Go through all possibilities for the `n_start_top`/`n_end_top` greater start and end logits.
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_index = int(start_indexes[i])
+                    j_index = i * end_n_top + j
+                    end_index = int(end_indexes[j_index])
+                    # Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
+                    # p_mask but let's not take any risk)
+                    if (
+                        start_index >= len(offset_mapping)
+                        or end_index >= len(offset_mapping)
+                        or offset_mapping[start_index] is None
+                        or offset_mapping[end_index] is None
+                    ):
+                        continue
+                    # Don't consider answers with a length negative or > max_answer_length.
+                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+                        continue
+                    # Don't consider answer that don't have the maximum context available (if such information is
+                    # provided).
+                    if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
+                        continue
+                    prelim_predictions.append(
+                        {
+                            "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
+                            "score": start_log_prob[i] + end_log_prob[j_index],
+                            "start_log_prob": start_log_prob[i],
+                            "end_log_prob": end_log_prob[j_index],
+                        }
+                    )
+
+        # Only keep the best `n_best_size` predictions.
+        predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
+
+        # Use the offsets to gather the answer text in the original context.
+        context = example["context"]
+        for pred in predictions:
+            offsets = pred.pop("offsets")
+            pred["text"] = context[offsets[0] : offsets[1]]
+
+        # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
+        # failure.
+        if len(predictions) == 0:
+            predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})
+
+        # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
+        # the LogSumExp trick).
+        scores = np.array([pred.pop("score") for pred in predictions])
+        exp_scores = np.exp(scores - np.max(scores))
+        probs = exp_scores / exp_scores.sum()
+
+        # Include the probabilities in our predictions.
+        for prob, pred in zip(probs, predictions):
+            pred["probability"] = prob
+
+        # Pick the best prediction and set the probability for the null answer.
+        all_predictions[example["id"]] = predictions[0]["text"]
+        if version_2_with_negative:
+            scores_diff_json[example["id"]] = float(min_null_score)
+
+        # Make `predictions` JSON-serializable by casting np.float back to float.
+        all_nbest_json[example["id"]] = [
+            {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
+            for pred in predictions
+        ]
+
+    # If we have an output_dir, let's save all those dicts.
+    if output_dir is not None:
+        assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
+
+        prediction_file = os.path.join(
+            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+        )
+        nbest_file = os.path.join(
+            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+        )
+        if version_2_with_negative:
+            null_odds_file = os.path.join(
+                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+            )
+
+        logger.info(f"Saving predictions to {prediction_file}.")
+        with open(prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+        logger.info(f"Saving nbest_preds to {nbest_file}.")
+        with open(nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+        if version_2_with_negative:
+            logger.info(f"Saving null_odds to {null_odds_file}.")
+            with open(null_odds_file, "w") as writer:
+                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions, scores_diff_json
--- a/examples/flax/text-classification/README.md
+++ b/examples/flax/text-classification/README.md
@@ -100,7 +100,7 @@ In the Tensorboard results linked below, the random seed of each model is equal

 | Task  | Metric                       | Acc (best run) | Acc (avg/5runs) | Stdev     | Metrics                                                                  |
 |-------|------------------------------|----------------|-----------------|-----------|--------------------------------------------------------------------------|
-| CoLA  | Matthew's corr               | 60.57          | 59.04           | 1.06      | [tfhub.dev](https://tensorboard.dev/experiment/lfr2adVpRtmLDALKrElkzg/)  |
+| CoLA  | Matthews corr                | 60.57          | 59.04           | 1.06      | [tfhub.dev](https://tensorboard.dev/experiment/lfr2adVpRtmLDALKrElkzg/)  |
 | SST-2 | Accuracy                     | 92.66          | 92.23           | 0.57      | [tfhub.dev](https://tensorboard.dev/experiment/jYvfv2trRHKMjoWnXVwrZA/)  |
 | MRPC  | F1/Accuracy                  | 89.90/85.78    | 88.97/84.36     | 0.72/1.09 | [tfhub.dev](https://tensorboard.dev/experiment/bo3W3DEoRw2Q7YXjWrJkfg/)  |
 | STS-B | Pearson/Spearman corr.       | 89.04/88.70    | 88.94/88.63     | 0.07/0.07 | [tfhub.dev](https://tensorboard.dev/experiment/fxVwbLD7QpKhbot0r9rn2w/)  |
--- a/examples/flax/token-classification/README.md
+++ b/examples/flax/token-classification/README.md
@@ -0,0 +1,74 @@
+<!---
+Copyright 2021 The Google Flax Team Authors and HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Token classification examples
+
+Fine-tuning the library models for token classification task such as Named Entity Recognition (NER), Parts-of-speech tagging (POS) or phrase extraction (CHUNKS). The main script run_flax_ner.py leverages the 🤗 Datasets library. You can easily customize it to your needs if you need extra processing on your datasets.
+
+It will either run on a datasets hosted on our hub or with your own text files for training and validation, you might just need to add some tweaks in the data preprocessing.
+
+The following example fine-tunes BERT on CoNLL-2003:
+
+To begin with it is recommended to create a model repository to save the trained model and logs.
+Here we call the model `"bert-ner-conll2003-test"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create bert-ner-conll2003-test
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/bert-ner-conll2003-test
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_flax_ner.py`.
+
+```bash
+export MODEL_DIR="./bert-ner-conll2003-test"
+ln -s ~/transformers/examples/flax/token-classification/run_flax_ner.py run_flax_ner.py
+```
+
+```bash
+python run_flax_ner.py \
+  --model_name_or_path bert-base-cased \
+  --dataset_name conll2003 \
+  --max_seq_length 128 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --per_device_train_batch_size 4 \
+  --output_dir ${MODEL_DIR} \
+  --eval_steps 300 \
+  --push_to_hub
+```
+
+Using the command above, the script will train for 3 epochs and run eval after each epoch. 
+Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
+You can see the results by running `tensorboard` in that directory:
+
+```bash
+$ tensorboard --logdir .
+```
+
+or directly on the hub under *Training metrics*.
+
+sample Metrics - [tfhub.dev](https://tensorboard.dev/experiment/u52qsBIpQSKEEXEJd2LVYA)
--- a/examples/flax/token-classification/requirements.txt
+++ b/examples/flax/token-classification/requirements.txt
@@ -0,0 +1,6 @@
+datasets >= 1.8.0
+jax>=0.2.8
+jaxlib>=0.1.59
+flax>=0.3.4
+optax>=0.0.8
+seqeval
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -0,0 +1,669 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Fine-tuning a 🤗 Flax Transformers model on token classification tasks (NER, POS, CHUNKS)"""
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass, field
+from itertools import chain
+from typing import Any, Callable, Dict, Optional, Tuple
+
+import datasets
+import numpy as np
+from datasets import ClassLabel, load_dataset, load_metric
+from tqdm import tqdm
+
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+from flax import struct, traverse_util
+from flax.jax_utils import replicate, unreplicate
+from flax.metrics import tensorboard
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    FlaxAutoModelForTokenClassification,
+    HfArgumentParser,
+    TrainingArguments,
+)
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+
+
+logger = logging.getLogger(__name__)
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.11.0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
+
+Array = Any
+Dataset = datasets.arrow_dataset.Dataset
+PRNGKey = Any
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    task_name: Optional[str] = field(default="ner", metadata={"help": "The name of the task (ner, pos...)."})
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a csv or JSON file)."}
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or JSON file)."},
+    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input test data file to predict on (a csv or JSON file)."},
+    )
+    text_column_name: Optional[str] = field(
+        default=None, metadata={"help": "The column name of text to input in the file (a csv or JSON file)."}
+    )
+    label_column_name: Optional[str] = field(
+        default=None, metadata={"help": "The column name of label to input in the file (a csv or JSON file)."}
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_seq_length: int = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. If set, sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    max_predict_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+            "value if set."
+        },
+    )
+    label_all_tokens: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to put the label for one word on all tokens of generated by that word or just on the "
+            "one (in which case the other tokens will have a padding index)."
+        },
+    )
+    return_entity_level_metrics: bool = field(
+        default=False,
+        metadata={"help": "Whether to return all the entity levels during evaluation or just the overall ones."},
+    )
+
+    def __post_init__(self):
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
+        self.task_name = self.task_name.lower()
+
+
+def create_train_state(
+    model: FlaxAutoModelForTokenClassification,
+    learning_rate_fn: Callable[[int], float],
+    num_labels: int,
+    training_args: TrainingArguments,
+) -> train_state.TrainState:
+    """Create initial training state."""
+
+    class TrainState(train_state.TrainState):
+        """Train state with an Optax optimizer.
+
+        The two functions below differ depending on whether the task is classification
+        or regression.
+
+        Args:
+          logits_fn: Applied to last layer to obtain the logits.
+          loss_fn: Function to compute the loss.
+        """
+
+        logits_fn: Callable = struct.field(pytree_node=False)
+        loss_fn: Callable = struct.field(pytree_node=False)
+
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        return traverse_util.unflatten_dict(flat_mask)
+
+    tx = optax.adamw(
+        learning_rate=learning_rate_fn,
+        b1=training_args.adam_beta1,
+        b2=training_args.adam_beta2,
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+        mask=decay_mask_fn,
+    )
+
+    def cross_entropy_loss(logits, labels):
+        xentropy = optax.softmax_cross_entropy(logits, onehot(labels, num_classes=num_labels))
+        return jnp.mean(xentropy)
+
+    return TrainState.create(
+        apply_fn=model.__call__,
+        params=model.params,
+        tx=tx,
+        logits_fn=lambda logits: logits.argmax(-1),
+        loss_fn=cross_entropy_loss,
+    )
+
+
+def create_learning_rate_fn(
+    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+) -> Callable[[int], jnp.array]:
+    """Returns a linear warmup, linear_decay learning rate function."""
+    steps_per_epoch = train_ds_size // train_batch_size
+    num_train_steps = steps_per_epoch * num_train_epochs
+    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+    decay_fn = optax.linear_schedule(
+        init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+    )
+    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+    return schedule_fn
+
+
+def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
+    """Returns shuffled batches of size `batch_size` from truncated `train dataset`, sharded over all local devices."""
+    steps_per_epoch = len(dataset) // batch_size
+    perms = jax.random.permutation(rng, len(dataset))
+    perms = perms[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+    perms = perms.reshape((steps_per_epoch, batch_size))
+
+    for perm in perms:
+        batch = dataset[perm]
+        batch = {k: np.array(v) for k, v in batch.items()}
+        batch = shard(batch)
+
+        yield batch
+
+
+def eval_data_collator(dataset: Dataset, batch_size: int):
+    """Returns batches of size `batch_size` from `eval dataset`, sharded over all local devices."""
+    for i in range(len(dataset) // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size]
+        batch = {k: np.array(v) for k, v in batch.items()}
+        batch = shard(batch)
+
+        yield batch
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
+    # 'tokens' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+    else:
+        # Loading the dataset from local csv or json file.
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = (data_args.train_file if data_args.train_file is not None else data_args.valid_file).split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+    # See more about loading any type of standard or custom dataset at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    if raw_datasets["train"] is not None:
+        column_names = raw_datasets["train"].column_names
+        features = raw_datasets["train"].features
+    else:
+        column_names = raw_datasets["validation"].column_names
+        features = raw_datasets["validation"].features
+
+    if data_args.text_column_name is not None:
+        text_column_name = data_args.text_column_name
+    elif "tokens" in column_names:
+        text_column_name = "tokens"
+    else:
+        text_column_name = column_names[0]
+
+    if data_args.label_column_name is not None:
+        label_column_name = data_args.label_column_name
+    elif f"{data_args.task_name}_tags" in column_names:
+        label_column_name = f"{data_args.task_name}_tags"
+    else:
+        label_column_name = column_names[1]
+
+    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
+    # unique labels.
+    def get_label_list(labels):
+        unique_labels = set()
+        for label in labels:
+            unique_labels = unique_labels | set(label)
+        label_list = list(unique_labels)
+        label_list.sort()
+        return label_list
+
+    if isinstance(features[label_column_name].feature, ClassLabel):
+        label_list = features[label_column_name].feature.names
+        # No need to convert the labels since they are already ints.
+        label_to_id = {i: i for i in range(len(label_list))}
+    else:
+        label_list = get_label_list(raw_datasets["train"][label_column_name])
+        label_to_id = {l: i for i, l in enumerate(label_list)}
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        label2id=label_to_id,
+        id2label={i: l for l, i in label_to_id.items()},
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
+    if config.model_type in {"gpt2", "roberta"}:
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name_or_path,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+            add_prefix_space=True,
+        )
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name_or_path,
+            cache_dir=model_args.cache_dir,
+            revision=model_args.model_revision,
+            use_auth_token=True if model_args.use_auth_token else None,
+        )
+    model = FlaxAutoModelForTokenClassification.from_pretrained(
+        model_args.model_name_or_path,
+        config=config,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+
+    # Preprocessing the datasets
+    # Tokenize all texts and align the labels with them.
+    def tokenize_and_align_labels(examples):
+        tokenized_inputs = tokenizer(
+            examples[text_column_name],
+            max_length=data_args.max_seq_length,
+            padding="max_length",
+            truncation=True,
+            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
+            is_split_into_words=True,
+        )
+
+        labels = []
+
+        for i, label in enumerate(examples[label_column_name]):
+            word_ids = tokenized_inputs.word_ids(batch_index=i)
+            previous_word_idx = None
+            label_ids = []
+            for word_idx in word_ids:
+                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
+                # ignored in the loss function.
+                if word_idx is None:
+                    label_ids.append(-100)
+                # We set the label for the first token of each word.
+                elif word_idx != previous_word_idx:
+                    label_ids.append(label_to_id[label[word_idx]])
+                # For the other tokens in a word, we set the label to either the current label or -100, depending on
+                # the label_all_tokens flag.
+                else:
+                    label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
+                previous_word_idx = word_idx
+
+            labels.append(label_ids)
+        tokenized_inputs["labels"] = labels
+        return tokenized_inputs
+
+    processed_raw_datasets = raw_datasets.map(
+        tokenize_and_align_labels,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        load_from_cache_file=not data_args.overwrite_cache,
+        remove_columns=raw_datasets["train"].column_names,
+        desc="Running tokenizer on dataset",
+    )
+
+    train_dataset = processed_raw_datasets["train"]
+    eval_dataset = processed_raw_datasets["validation"]
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # Define a summary writer
+    summary_writer = tensorboard.SummaryWriter(training_args.output_dir)
+    summary_writer.hparams({**training_args.to_dict(), **vars(model_args), **vars(data_args)})
+
+    def write_train_metric(summary_writer, train_metrics, train_time, step):
+        summary_writer.scalar("train_time", train_time, step)
+
+        train_metrics = get_metrics(train_metrics)
+        for key, vals in train_metrics.items():
+            tag = f"train_{key}"
+            for i, val in enumerate(vals):
+                summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+    def write_eval_metric(summary_writer, eval_metrics, step):
+        for metric_name, value in eval_metrics.items():
+            summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+    num_epochs = int(training_args.num_train_epochs)
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+    train_batch_size = training_args.per_device_train_batch_size * jax.local_device_count()
+    eval_batch_size = training_args.per_device_eval_batch_size * jax.local_device_count()
+
+    learning_rate_fn = create_learning_rate_fn(
+        len(train_dataset),
+        train_batch_size,
+        training_args.num_train_epochs,
+        training_args.warmup_steps,
+        training_args.learning_rate,
+    )
+
+    state = create_train_state(model, learning_rate_fn, num_labels=num_labels, training_args=training_args)
+
+    # define step functions
+    def train_step(
+        state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey
+    ) -> Tuple[train_state.TrainState, float]:
+        """Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`."""
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        targets = batch.pop("labels")
+
+        def loss_fn(params):
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            loss = state.loss_fn(logits, targets)
+            return loss
+
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch")
+        return new_state, metrics, new_dropout_rng
+
+    p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,))
+
+    def eval_step(state, batch):
+        logits = state.apply_fn(**batch, params=state.params, train=False)[0]
+        return state.logits_fn(logits)
+
+    p_eval_step = jax.pmap(eval_step, axis_name="batch")
+
+    metric = load_metric("seqeval")
+
+    def get_labels(y_pred, y_true):
+        # Transform predictions and references tensos to numpy arrays
+
+        # Remove ignored index (special tokens)
+        true_predictions = [
+            [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
+            for pred, gold_label in zip(y_pred, y_true)
+        ]
+        true_labels = [
+            [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
+            for pred, gold_label in zip(y_pred, y_true)
+        ]
+        return true_predictions, true_labels
+
+    def compute_metrics():
+        results = metric.compute()
+        if data_args.return_entity_level_metrics:
+            # Unpack nested dictionaries
+            final_results = {}
+            for key, value in results.items():
+                if isinstance(value, dict):
+                    for n, v in value.items():
+                        final_results[f"{key}_{n}"] = v
+                else:
+                    final_results[key] = value
+            return final_results
+        else:
+            return {
+                "precision": results["overall_precision"],
+                "recall": results["overall_recall"],
+                "f1": results["overall_f1"],
+                "accuracy": results["overall_accuracy"],
+            }
+
+    logger.info(f"===== Starting training ({num_epochs} epochs) =====")
+    train_time = 0
+
+    # make sure weights are replicated on each device
+    state = replicate(state)
+
+    train_time = 0
+    step_per_epoch = len(train_dataset) // train_batch_size
+    total_steps = step_per_epoch * num_epochs
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+
+        train_start = time.time()
+        train_metrics = []
+
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+
+        # train
+        for step, batch in enumerate(
+            tqdm(
+                train_data_collator(input_rng, train_dataset, train_batch_size),
+                total=step_per_epoch,
+                desc="Training...",
+                position=1,
+            )
+        ):
+            state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs)
+            train_metrics.append(train_metric)
+
+            cur_step = epoch * step_per_epoch + step
+
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+                epochs.write(
+                    f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+
+                train_metrics = []
+
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+
+                eval_metrics = {}
+                # evaluate
+                for batch in tqdm(
+                    eval_data_collator(eval_dataset, eval_batch_size),
+                    total=len(eval_dataset) // eval_batch_size,
+                    desc="Evaluating ...",
+                    position=2,
+                ):
+                    labels = batch.pop("labels")
+                    predictions = p_eval_step(state, batch)
+                    predictions = np.array([pred for pred in chain(*predictions)])
+                    labels = np.array([label for label in chain(*labels)])
+                    labels[np.array(chain(*batch["attention_mask"])) == 0] = -100
+                    preds, refs = get_labels(predictions, labels)
+                    metric.add_batch(
+                        predictions=preds,
+                        references=refs,
+                    )
+
+                # evaluate also on leftover examples (not divisible by batch_size)
+                num_leftover_samples = len(eval_dataset) % eval_batch_size
+
+                # make sure leftover batch is evaluated on one device
+                if num_leftover_samples > 0 and jax.process_index() == 0:
+                    # take leftover samples
+                    batch = eval_dataset[-num_leftover_samples:]
+                    batch = {k: np.array(v) for k, v in batch.items()}
+
+                    labels = batch.pop("labels")
+                    predictions = eval_step(unreplicate(state), batch)
+                    labels = np.array(labels)
+                    labels[np.array(batch["attention_mask"]) == 0] = -100
+                    preds, refs = get_labels(predictions, labels)
+                    metric.add_batch(
+                        predictions=preds,
+                        references=refs,
+                    )
+
+                eval_metrics = compute_metrics()
+
+                if data_args.return_entity_level_metrics:
+                    logger.info(f"Step... ({cur_step}/{total_steps} | Validation metrics: {eval_metrics}")
+                else:
+                    logger.info(
+                        f"Step... ({cur_step}/{total_steps} | Validation f1: {eval_metrics['f1']}, Validation Acc: {eval_metrics['accuracy']})"
+                    )
+
+                if jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps):
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(unreplicate(state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
+        epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}"
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@@ -77,7 +77,7 @@ class Split(Enum):

 if is_torch_available():
    import torch
-    from torch.utils.data.dataset import Dataset
+    from torch.utils.data import Dataset

    class MultipleChoiceDataset(Dataset):
        """
--- a/examples/legacy/seq2seq/seq2seq_trainer.py
+++ b/examples/legacy/seq2seq/seq2seq_trainer.py
@@ -141,7 +141,7 @@ class Seq2SeqTrainer(Trainer):
            )
        return scheduler

-    def _get_train_sampler(self) -> Optional[torch.utils.data.sampler.Sampler]:
+    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
            return None
        elif is_torch_tpu_available():
--- a/examples/legacy/token-classification/utils_ner.py
+++ b/examples/legacy/token-classification/utils_ner.py
@@ -206,7 +206,7 @@ class TokenClassificationTask:
 if is_torch_available():
    import torch
    from torch import nn
-    from torch.utils.data.dataset import Dataset
+    from torch.utils.data import Dataset

    class TokenClassificationDataset(Dataset):
        """
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -74,6 +74,17 @@ line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume
 2. If `resume_from_checkpoint` is a path to a specific checkpoint it will use that saved checkpoint folder to resume the training from.


+### Upload the trained/fine-tuned model to the Hub
+
+All the example scripts support automatic upload of your final model to the [Model Hub](https://huggingface.co/models) by adding a `--push_to_hub` argument. It will then create a repository with your username slash the name of the folder you are using as `output_dir`. For instance, `"sgugger/test-mrpc"` if your username is `sgugger` and you are working in the folder `~/tmp/test-mrpc`.
+
+To specify a given repository name, use the `--hub_model_id` argument. You will need to specify the whole repository name (including your username), for instance `--hub_model_id sgugger/finetuned-bert-mrpc`. To upload to an organization you are a member of, just use the name of that organization instead of your username: `--hub_model_id huggingface/finetuned-bert-mrpc`.
+
+A few notes on this integration:
+
+- you will need to be logged in to the Hugging Face website locally for it to work, the easiest way to achieve this is to run `huggingface-cli login` and then type your username and password when prompted. You can also pass along your authentication token with the `--hub_token` argument.
+- the `output_dir` you pick will either need to be a new folder or a local clone of the distant repository you are using.
+
 ## Distributed training and mixed precision

 All the PyTorch scripts mentioned above work out of the box with distributed training and mixed precision, thanks to
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@@ -18,3 +18,5 @@ pytest
 conllu
 sentencepiece != 0.1.92
 protobuf
+torchvision
+jiwer
--- a/examples/pytorch/image-classification/README.md
+++ b/examples/pytorch/image-classification/README.md
@@ -0,0 +1,133 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Image classification examples
+
+The following examples showcase how to fine-tune a `ViT` for image-classification using PyTorch.
+
+## Using datasets from 🤗 `datasets`
+
+Here we show how to fine-tune a `ViT` on the [beans](https://huggingface.co/datasets/beans) dataset.
+
+👀 See the results here: [nateraw/vit-base-beans](https://huggingface.co/nateraw/vit-base-beans).
+
+```bash
+python run_image_classification.py \
+    --dataset_name beans \
+    --output_dir ./beans_outputs/ \
+    --remove_unused_columns False \
+    --do_train \
+    --do_eval \
+    --push_to_hub \
+    --push_to_hub_model_id vit-base-beans \
+    --learning_rate 2e-5 \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 8 \
+    --per_device_eval_batch_size 8 \
+    --logging_strategy steps \
+    --logging_steps 10 \
+    --evaluation_strategy epoch \
+    --save_strategy epoch \
+    --load_best_model_at_end True \
+    --save_total_limit 3 \
+    --seed 1337
+```
+
+Here we show how to fine-tune a `ViT` on the [cats_vs_dogs](https://huggingface.co/datasets/cats_vs_dogs) dataset.
+
+👀 See the results here: [nateraw/vit-base-cats-vs-dogs](https://huggingface.co/nateraw/vit-base-cats-vs-dogs).
+
+```bash
+python run_image_classification.py \
+    --dataset_name cats_vs_dogs \
+    --output_dir ./cats_vs_dogs_outputs/ \
+    --remove_unused_columns False \
+    --do_train \
+    --do_eval \
+    --push_to_hub \
+    --push_to_hub_model_id vit-base-cats-vs-dogs \
+    --fp16 True \
+    --learning_rate 2e-4 \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 32 \
+    --per_device_eval_batch_size 32 \
+    --logging_strategy steps \
+    --logging_steps 10 \
+    --evaluation_strategy epoch \
+    --save_strategy epoch \
+    --load_best_model_at_end True \
+    --save_total_limit 3 \
+    --seed 1337
+```
+
+## Using your own data
+
+To use your own dataset, the training script expects the following directory structure:
+
+```bash
+root/dog/xxx.png
+root/dog/xxy.png
+root/dog/[...]/xxz.png
+
+root/cat/123.png
+root/cat/nsdf3.png
+root/cat/[...]/asd932_.png
+```
+
+Once you've prepared your dataset, you can can run the script like this:
+
+```bash
+python run_image_classification.py \
+    --dataset_name nateraw/image-folder \
+    --train_dir <path-to-train-root> \
+    --output_dir ./outputs/ \
+    --remove_unused_columns False \
+    --do_train \
+    --do_eval
+```
+
+### 💡 The above will split the train dir into training and evaluation sets
+  - To control the split amount, use the `--train_val_split` flag.
+  - To provide your own validation split in its own directory, you can pass the `--validation_dir <path-to-val-root>` flag.
+
+
+## Sharing your model on 🤗 Hub
+
+0. If you haven't already, [sign up](https://huggingface.co/join) for a 🤗 account
+
+1. Make sure you have `git-lfs` installed and git set up.
+
+```bash
+$ apt install git-lfs
+$ git config --global user.email "you@example.com"
+$ git config --global user.name "Your Name"
+```
+
+2. Log in with your HuggingFace account credentials using `huggingface-cli`
+
+```bash
+$ huggingface-cli login
+# ...follow the prompts
+```
+
+3. When running the script, pass the following arguments:
+
+```bash
+python run_image_classification.py \
+    --push_to_hub \
+    --push_to_hub_model_id <name-your-model> \
+    ...
+```
--- a/examples/pytorch/image-classification/requirements.txt
+++ b/examples/pytorch/image-classification/requirements.txt
@@ -0,0 +1,2 @@
+torch>=1.9.0
+torchvision>=0.10.0
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import datasets
+import numpy as np
+import torch
+from datasets import load_dataset
+from PIL import Image
+from torchvision.transforms import (
+    CenterCrop,
+    Compose,
+    Normalize,
+    RandomHorizontalFlip,
+    RandomResizedCrop,
+    Resize,
+    ToTensor,
+)
+
+import transformers
+from transformers import (
+    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    AutoConfig,
+    AutoFeatureExtractor,
+    AutoModelForImageClassification,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import check_min_version
+from transformers.utils.versions import require_version
+
+
+""" Fine-tuning a 🤗 Transformers model for image classification"""
+
+logger = logging.getLogger(__name__)
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.11.0")
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
+
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def pil_loader(path: str):
+    with open(path, "rb") as f:
+        im = Image.open(f)
+        return im.convert("RGB")
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    Using `HfArgumentParser` we can turn this class
+    into argparse arguments to be able to specify them on
+    the command line.
+    """
+
+    dataset_name: Optional[str] = field(
+        default="nateraw/image-folder", metadata={"help": "Name of a dataset from the datasets package"}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."})
+    validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."})
+    train_val_split: Optional[float] = field(
+        default=0.15, metadata={"help": "Percent to split off of train for validation."}
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    image_size: Optional[int] = field(default=224, metadata={"help": " The size (resolution) of each image."})
+
+    def __post_init__(self):
+        data_files = dict()
+        if self.train_dir is not None:
+            data_files["train"] = self.train_dir
+        if self.validation_dir is not None:
+            data_files["val"] = self.validation_dir
+        self.data_files = data_files if data_files else None
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        default="google/vit-base-patch16-224-in21k",
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+
+
+def collate_fn(examples):
+    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    labels = torch.tensor([example["labels"] for example in examples])
+    return {"pixel_values": pixel_values, "labels": labels}
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Initialize our dataset and prepare it for the 'image-classification' task.
+    ds = load_dataset(
+        data_args.dataset_name,
+        data_args.dataset_config_name,
+        data_files=data_args.data_files,
+        cache_dir=model_args.cache_dir,
+        task="image-classification",
+    )
+
+    # Define torchvision transforms to be applied to each image.
+    normalize = Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    _train_transforms = Compose(
+        [
+            RandomResizedCrop(data_args.image_size),
+            RandomHorizontalFlip(),
+            ToTensor(),
+            normalize,
+        ]
+    )
+    _val_transforms = Compose(
+        [
+            Resize(data_args.image_size),
+            CenterCrop(data_args.image_size),
+            ToTensor(),
+            normalize,
+        ]
+    )
+
+    def train_transforms(example_batch):
+        """Apply _train_transforms across a batch."""
+        example_batch["pixel_values"] = [_train_transforms(pil_loader(f)) for f in example_batch["image_file_path"]]
+        return example_batch
+
+    def val_transforms(example_batch):
+        """Apply _val_transforms across a batch."""
+        example_batch["pixel_values"] = [_val_transforms(pil_loader(f)) for f in example_batch["image_file_path"]]
+        return example_batch
+
+    # If we don't have a validation split, split off a percentage of train as validation.
+    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
+    if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
+        split = ds["train"].train_test_split(data_args.train_val_split)
+        ds["train"] = split["train"]
+        ds["validation"] = split["test"]
+
+    # Prepare label mappings.
+    # We'll include these in the model's config to get human readable labels in the Inference API.
+    labels = ds["train"].features["labels"].names
+    label2id, id2label = dict(), dict()
+    for i, label in enumerate(labels):
+        label2id[label] = str(i)
+        id2label[str(i)] = label
+
+    # Load the accuracy metric from the datasets package
+    metric = datasets.load_metric("accuracy")
+
+    # Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
+    # predictions and label_ids field) and has to return a dictionary string to float.
+    def compute_metrics(p):
+        """Computes accuracy on a batch of predictions"""
+        return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name or model_args.model_name_or_path,
+        num_labels=len(labels),
+        label2id=label2id,
+        id2label=id2label,
+        finetuning_task="image-classification",
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    model = AutoModelForImageClassification.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # NOTE - We aren't directly using this feature extractor since we defined custom transforms above.
+    # We initialize this instance below and pass it to Trainer to ensure that the feature extraction
+    # config, preprocessor_config.json, is included in output directories.
+    # This way if we push a model to the hub, the inference widget will work.
+    feature_extractor = AutoFeatureExtractor.from_pretrained(
+        model_args.feature_extractor_name or model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+        size=data_args.image_size,
+        image_mean=normalize.mean,
+        image_std=normalize.std,
+    )
+
+    if training_args.do_train:
+        if "train" not in ds:
+            raise ValueError("--do_train requires a train dataset")
+        if data_args.max_train_samples is not None:
+            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
+        # Set the training transforms
+        ds["train"].set_transform(train_transforms)
+
+    if training_args.do_eval:
+        if "validation" not in ds:
+            raise ValueError("--do_eval requires a validation dataset")
+        if data_args.max_eval_samples is not None:
+            ds["validation"] = (
+                ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
+            )
+        # Set the validation transforms
+        ds["validation"].set_transform(val_transforms)
+
+    # Initalize our trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=ds["train"] if training_args.do_train else None,
+        eval_dataset=ds["validation"] if training_args.do_eval else None,
+        compute_metrics=compute_metrics,
+        tokenizer=feature_extractor,
+        data_collator=collate_fn,
+    )
+
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
+        trainer.save_metrics("train", train_result.metrics)
+        trainer.save_state()
+
+    # Evaluation
+    if training_args.do_eval:
+        metrics = trainer.evaluate()
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    # Write model card and (optionally) push to hub
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "tasks": "image-classification",
+        "dataset": data_args.dataset_name,
+        "tags": ["image-classification"],
+    }
+    if training_args.push_to_hub:
+        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/pytorch/language-modeling/README.md
+++ b/examples/pytorch/language-modeling/README.md
@@ -174,8 +174,3 @@ python run_clm.py --model_type gpt2 --tokenizer_name gpt2 \ --config_overrides="
 ```

 This feature is only available in `run_clm.py`, `run_plm.py` and `run_mlm.py`.
-
-This feature can also be used to activate gradient checkpointing by passing:
-```
--config_overrides "gradient_checkpointing=true,use_cache=False"
-```
--- a/examples/pytorch/language-modeling/requirements.txt
+++ b/examples/pytorch/language-modeling/requirements.txt
@@ -1,3 +1,4 @@
+accelerate
 torch >= 1.3
 datasets >= 1.8.0
 sentencepiece != 0.1.92
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -51,7 +51,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -172,6 +172,9 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -266,6 +269,7 @@ def main():
            )
    else:
        data_files = {}
+        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
@@ -277,7 +281,8 @@ def main():
        )
        if extension == "txt":
            extension = "text"
-        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
@@ -285,12 +290,14 @@ def main():
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
+                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
+                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
@@ -493,17 +500,19 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

-    if training_args.push_to_hub:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
-        if data_args.dataset_name is not None:
-            kwargs["dataset_tags"] = data_args.dataset_name
-            if data_args.dataset_config_name is not None:
-                kwargs["dataset_args"] = data_args.dataset_config_name
-                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-            else:
-                kwargs["dataset"] = data_args.dataset_name
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name

+    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)


 def _mp_fn(index):
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -27,15 +27,17 @@ import logging
 import math
 import os
 import random
+from pathlib import Path

 import datasets
 import torch
 from datasets import load_dataset
-from torch.utils.data.dataloader import DataLoader
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

 import transformers
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
+from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
@@ -48,6 +50,7 @@ from transformers import (
    get_scheduler,
    set_seed,
 )
+from transformers.file_utils import get_full_repo_name
 from transformers.utils.versions import require_version


@@ -173,7 +176,14 @@ def parse_args():
    parser.add_argument(
        "--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
    )
-
+    parser.add_argument(
+        "--no_keep_linebreaks", action="store_true", help="Do not keep line breaks when using TXT files."
+    )
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
+    )
+    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    args = parser.parse_args()

    # Sanity checks
@@ -187,8 +197,8 @@ def parse_args():
            extension = args.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
+    if args.push_to_hub:
+        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

    return args

@@ -220,6 +230,18 @@ def main():
    if args.seed is not None:
        set_seed(args.seed)

+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -245,6 +267,7 @@ def main():
            )
    else:
        data_files = {}
+        dataset_args = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
@@ -252,18 +275,21 @@ def main():
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        raw_datasets = load_dataset(extension, data_files=data_files)
+            dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
+        raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{args.validation_split_percentage}%]",
+                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{args.validation_split_percentage}%:]",
+                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
@@ -403,6 +429,10 @@ def main():
        model, optimizer, train_dataloader, eval_dataloader
    )

+    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
+    if accelerator.distributed_type == DistributedType.TPU:
+        model.tie_weights()
+
    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

@@ -469,10 +499,22 @@ def main():

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

+        if args.push_to_hub and epoch < args.num_train_epochs - 1:
+            accelerator.wait_for_everyone()
+            unwrapped_model = accelerator.unwrap_model(model)
+            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+            if accelerator.is_main_process:
+                tokenizer.save_pretrained(args.output_dir)
+                repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
+
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+        if accelerator.is_main_process:
+            tokenizer.save_pretrained(args.output_dir)
+            if args.push_to_hub:
+                repo.push_to_hub(commit_message="End of training")


 if __name__ == "__main__":
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -528,17 +528,19 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

-    if training_args.push_to_hub:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
-        if data_args.dataset_name is not None:
-            kwargs["dataset_tags"] = data_args.dataset_name
-            if data_args.dataset_config_name is not None:
-                kwargs["dataset_args"] = data_args.dataset_config_name
-                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-            else:
-                kwargs["dataset"] = data_args.dataset_name
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name

+    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)


 def _mp_fn(index):
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -27,15 +27,17 @@ import logging
 import math
 import os
 import random
+from pathlib import Path

 import datasets
 import torch
 from datasets import load_dataset
-from torch.utils.data.dataloader import DataLoader
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

 import transformers
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
+from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
@@ -48,6 +50,7 @@ from transformers import (
    get_scheduler,
    set_seed,
 )
+from transformers.file_utils import get_full_repo_name
 from transformers.utils.versions import require_version


@@ -185,7 +188,11 @@ def parse_args():
    parser.add_argument(
        "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss"
    )
-
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
+    )
+    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    args = parser.parse_args()

    # Sanity checks
@@ -199,8 +206,8 @@ def parse_args():
            extension = args.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file."

-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
+    if args.push_to_hub:
+        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

    return args

@@ -232,6 +239,18 @@ def main():
    if args.seed is not None:
        set_seed(args.seed)

+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -448,6 +467,10 @@ def main():
        model, optimizer, train_dataloader, eval_dataloader
    )

+    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
+    if accelerator.distributed_type == DistributedType.TPU:
+        model.tie_weights()
+
    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

@@ -514,10 +537,22 @@ def main():

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

+        if args.push_to_hub and epoch < args.num_train_epochs - 1:
+            accelerator.wait_for_everyone()
+            unwrapped_model = accelerator.unwrap_model(model)
+            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+            if accelerator.is_main_process:
+                tokenizer.save_pretrained(args.output_dir)
+                repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
+
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+        if accelerator.is_main_process:
+            tokenizer.save_pretrained(args.output_dir)
+            if args.push_to_hub:
+                repo.push_to_hub(commit_message="End of training")


 if __name__ == "__main__":
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -46,7 +46,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@@ -499,17 +499,19 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

-    if training_args.push_to_hub:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "language-modeling"}
-        if data_args.dataset_name is not None:
-            kwargs["dataset_tags"] = data_args.dataset_name
-            if data_args.dataset_config_name is not None:
-                kwargs["dataset_args"] = data_args.dataset_config_name
-                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-            else:
-                kwargs["dataset"] = data_args.dataset_name
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "language-modeling"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name

+    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)


 def _mp_fn(index):
--- a/examples/pytorch/multiple-choice/requirements.txt
+++ b/examples/pytorch/multiple-choice/requirements.txt
@@ -1,3 +1,4 @@
+accelerate
 sentencepiece != 0.1.92
 protobuf
 torch >= 1.3
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -47,7 +47,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 logger = logging.getLogger(__name__)

@@ -430,15 +430,19 @@ def main():
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

+    kwargs = dict(
+        finetuned_from=model_args.model_name_or_path,
+        tasks="multiple-choice",
+        dataset_tags="swag",
+        dataset_args="regular",
+        dataset="SWAG",
+        language="en",
+    )
+
    if training_args.push_to_hub:
-        trainer.push_to_hub(
-            finetuned_from=model_args.model_name_or_path,
-            tasks="multiple-choice",
-            dataset_tags="swag",
-            dataset_args="regular",
-            dataset="SWAG",
-            language="en",
-        )
+        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)


 def _mp_fn(index):
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -24,16 +24,18 @@ import math
 import os
 import random
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Optional, Union

 import datasets
 import torch
 from datasets import load_dataset, load_metric
-from torch.utils.data.dataloader import DataLoader
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

 import transformers
 from accelerate import Accelerator
+from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
@@ -47,7 +49,7 @@ from transformers import (
    get_scheduler,
    set_seed,
 )
-from transformers.file_utils import PaddingStrategy
+from transformers.file_utils import PaddingStrategy, get_full_repo_name


 logger = logging.getLogger(__name__)
@@ -169,9 +171,15 @@ def parse_args():
        action="store_true",
        help="Activate debug mode and run training only with a subset of data.",
    )
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
+    )
+    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    args = parser.parse_args()
-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
+
+    if args.push_to_hub:
+        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

    return args

@@ -260,6 +268,18 @@ def main():
    if args.seed is not None:
        set_seed(args.seed)

+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -478,10 +498,22 @@ def main():
        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

+        if args.push_to_hub and epoch < args.num_train_epochs - 1:
+            accelerator.wait_for_everyone()
+            unwrapped_model = accelerator.unwrap_model(model)
+            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+            if accelerator.is_main_process:
+                tokenizer.save_pretrained(args.output_dir)
+                repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
+
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+        if accelerator.is_main_process:
+            tokenizer.save_pretrained(args.output_dir)
+            if args.push_to_hub:
+                repo.push_to_hub(commit_message="End of training")


 if __name__ == "__main__":
--- a/examples/pytorch/question-answering/requirements.txt
+++ b/examples/pytorch/question-answering/requirements.txt
@@ -1,2 +1,3 @@
+accelerate
 datasets >= 1.8.0
 torch >= 1.3.0
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -48,7 +48,7 @@ from utils_qa import postprocess_qa_predictions


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -339,6 +339,11 @@ def main():

    # Training preprocessing
    def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
@@ -433,6 +438,11 @@ def main():

    # Validation preprocessing
    def prepare_validation_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
@@ -613,17 +623,19 @@ def main():
        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

-    if training_args.push_to_hub:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
-        if data_args.dataset_name is not None:
-            kwargs["dataset_tags"] = data_args.dataset_name
-            if data_args.dataset_config_name is not None:
-                kwargs["dataset_args"] = data_args.dataset_config_name
-                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-            else:
-                kwargs["dataset"] = data_args.dataset_name
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name

+    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)


 def _mp_fn(index):
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -47,7 +47,7 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -327,6 +327,11 @@ def main():

    # Training preprocessing
    def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
@@ -651,17 +656,19 @@ def main():
        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

-    if training_args.push_to_hub:
-        kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
-        if data_args.dataset_name is not None:
-            kwargs["dataset_tags"] = data_args.dataset_name
-            if data_args.dataset_config_name is not None:
-                kwargs["dataset_args"] = data_args.dataset_config_name
-                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
-            else:
-                kwargs["dataset"] = data_args.dataset_name
+    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
+    if data_args.dataset_name is not None:
+        kwargs["dataset_tags"] = data_args.dataset_name
+        if data_args.dataset_config_name is not None:
+            kwargs["dataset_args"] = data_args.dataset_config_name
+            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+        else:
+            kwargs["dataset"] = data_args.dataset_name

+    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
+    else:
+        trainer.create_model_card(**kwargs)


 def _mp_fn(index):
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -23,16 +23,18 @@ import logging
 import math
 import os
 import random
+from pathlib import Path

 import datasets
 import numpy as np
 import torch
 from datasets import load_dataset, load_metric
-from torch.utils.data.dataloader import DataLoader
+from torch.utils.data import DataLoader
 from tqdm.auto import tqdm

 import transformers
 from accelerate import Accelerator
+from huggingface_hub import Repository
 from transformers import (
    AdamW,
    DataCollatorWithPadding,
@@ -45,13 +47,14 @@ from transformers import (
    get_scheduler,
    set_seed,
 )
+from transformers.file_utils import get_full_repo_name
 from transformers.utils import check_min_version
 from transformers.utils.versions import require_version
 from utils_qa import postprocess_qa_predictions_with_beam_search


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.9.0")
+check_min_version("4.11.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@@ -203,7 +206,11 @@ def parse_args():
        default=None,
        help="For debugging purposes or quicker training, truncate the number of prediction examples to this",
    )
-
+    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
+    parser.add_argument(
+        "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
+    )
+    parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
    args = parser.parse_args()

    # Sanity checks
@@ -225,8 +232,8 @@ def parse_args():
            extension = args.test_file.split(".")[-1]
            assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."

-    if args.output_dir is not None:
-        os.makedirs(args.output_dir, exist_ok=True)
+    if args.push_to_hub:
+        assert args.output_dir is not None, "Need an `output_dir` to create a repo when `--push_to_hub` is passed."

    return args

@@ -258,6 +265,18 @@ def main():
    if args.seed is not None:
        set_seed(args.seed)

+    # Handle the repository creation
+    if accelerator.is_main_process:
+        if args.push_to_hub:
+            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
+            else:
+                repo_name = args.hub_model_id
+            repo = Repository(args.output_dir, clone_from=repo_name)
+        elif args.output_dir is not None:
+            os.makedirs(args.output_dir, exist_ok=True)
+    accelerator.wait_for_everyone()
+
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -315,6 +334,11 @@ def main():

    # Training preprocessing
    def prepare_train_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
@@ -430,6 +454,11 @@ def main():

    # Validation preprocessing
    def prepare_validation_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
@@ -693,8 +722,15 @@ def main():
            if completed_steps >= args.max_train_steps:
                break

-        # intialize all lists to collect the batches
+        if args.push_to_hub and epoch < args.num_train_epochs - 1:
+            accelerator.wait_for_everyone()
+            unwrapped_model = accelerator.unwrap_model(model)
+            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+            if accelerator.is_main_process:
+                tokenizer.save_pretrained(args.output_dir)
+                repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)

+    # intialize all lists to collect the batches
    all_start_top_log_probs = []
    all_start_top_index = []
    all_end_top_log_probs = []
@@ -811,6 +847,10 @@ def main():
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
+        if accelerator.is_main_process:
+            tokenizer.save_pretrained(args.output_dir)
+            if args.push_to_hub:
+                repo.push_to_hub(commit_message="End of training")


 if __name__ == "__main__":
--- a/Show More
+++ b/Show More