Release v4.11.2

Fix gather for TPU (#13813 )
Release v4.11.1
2021-09-30 11:54:39 -04:00 · 2021-09-30 11:53:13 -04:00 · 2021-09-29 12:04:25 -04:00 · 2021-09-29 12:03:56 -04:00 · 2021-09-29 12:03:51 -04:00 · 2021-09-29 12:03:46 -04:00
863 changed files with 107694 additions and 24147 deletions
--- a/.circleci/TROUBLESHOOT.md
+++ b/.circleci/TROUBLESHOOT.md
@@ -0,0 +1,7 @@
+# Troubleshooting
+
+This document explains how to deal with various issues on CircleCI. Entries may include actual solutions or pointers to the Issues that cover them.
+
+## Circle CI
+
+* pytest worker runs out of resident RAM and gets killed by `cgroups`: https://github.com/huggingface/transformers/issues/11408
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,13 +80,50 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf $(cat test_list.txt) -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_torch_and_tf_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PT_TF_CROSS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                key: v0.4-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf tests -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -110,13 +147,50 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax $(cat test_list.txt) -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_torch_and_flax_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PT_FLAX_CROSS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                key: v0.4-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax tests -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -139,13 +213,49 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision,timm]
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_torch_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch tests | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -167,12 +277,46 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf $(cat test_list.txt) | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_tf_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf tests | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -194,12 +338,46 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,testing,sentencepiece]
+            - run: sudo pip install .[flax,testing,sentencepiece,flax-speech,vision]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax $(cat test_list.txt) | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_flax_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                keys:
+                    - v0.4-flax-{{ checksum "setup.py" }}
+                    - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: sudo pip install .[flax,testing,sentencepiece,vision,flax-speech]
+            - save_cache:
+                  key: v0.4-flax-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax tests | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -223,13 +401,50 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision]
            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test $(cat test_list.txt) | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_pipelines_torch_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test tests | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -257,7 +472,42 @@ jobs:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf $(cat test_list.txt) -m is_pipeline_test | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_tests_pipelines_tf_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf tests -m is_pipeline_test | tee tests_output.txt
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -283,7 +533,10 @@ jobs:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -304,14 +557,51 @@ jobs:
                  keys:
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,sentencepiece,testing]
+            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
            - run: pip install -r examples/pytorch/_tests_requirements.txt
            - save_cache:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |  # tee to examples_output.txt: that is the path the following store_artifacts step uploads
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_examples_torch_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_examples-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
+            - run: pip install -r examples/pytorch/_tests_requirements.txt
+            - save_cache:
+                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
@@ -343,12 +633,117 @@ jobs:
                  key: v0.4-hub-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -sv ./tests/ -m is_staging_test
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -sv --make-reports=tests_hub $(cat test_list.txt) -m is_staging_test | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_hub_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            HUGGINGFACE_CO_STAGING: yes
+            RUN_GIT_LFS_TESTS: yes
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-hub-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get install git-lfs
+            - run: |
+                git config --global user.email "ci@dummy.com"
+                git config --global user.name "ci"
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,sentencepiece,testing]
+            - save_cache:
+                  key: v0.4-hub-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -sv --make-reports=tests_hub tests -m is_staging_test | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_tests_onnxruntime:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,sentencepiece,onnxruntime]
+            - save_cache:
+                  key: v0.4-onnx-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx $(cat test_list.txt) -k onnx | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_onnxruntime_all:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,sentencepiece,onnxruntime]
+            - save_cache:
+                  key: v0.4-onnx-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |
+                  python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx tests -k onnx | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    build_doc:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
+        resource_class: large
        steps:
            - checkout
            - restore_cache:
@@ -370,6 +765,7 @@ jobs:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
+        resource_class: large
        steps:
            - add_ssh_keys:
                fingerprints:
@@ -392,7 +788,7 @@ jobs:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
-        resource_class: medium
+        resource_class: large
        environment:
            TRANSFORMERS_IS_CI: yes
        parallelism: 1
@@ -403,7 +799,7 @@ jobs:
                      - v0.4-code_quality-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install isort
+            - run: pip install isort GitPython
            - run: pip install .[all,quality]
            - save_cache:
                  key: v0.4-code_quality-{{ checksum "setup.py" }}
@@ -419,6 +815,8 @@ jobs:
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
            - run: python utils/check_inits.py
+            - run: make deps_table_check_updated
+            - run: python utils/tests_fetcher.py --sanity_check

    check_repository_consistency:
        working_directory: ~/transformers
@@ -431,6 +829,44 @@ jobs:
            - run: pip install requests
            - run: python ./utils/link_tester.py

+    run_tests_layoutlmv2:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,vision]
+            - run: pip install torchvision
+            - run: python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+            - run: sudo apt install tesseract-ocr
+            - run: pip install pytesseract
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |  # pipe through tee so tests_output.txt exists for the following store_artifacts step
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 1 tests/*layoutlmv2* --dist=loadfile -s --make-reports=tests_layoutlmv2 --durations=100 | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
 # TPU JOBS
    run_examples_tpu:
        docker:
@@ -482,9 +918,31 @@ workflows:
            - run_tests_flax
            - run_tests_pipelines_torch
            - run_tests_pipelines_tf
+            - run_tests_onnxruntime
            - run_tests_hub
            - build_doc
+            - run_tests_layoutlmv2
            - deploy_doc: *workflow_filters
+    nightly:
+        triggers:
+            - schedule:
+                cron: "0 0 * * *"
+                filters:
+                    branches:
+                        only:
+                            - master
+        jobs:
+            - run_examples_torch_all
+            - run_tests_torch_and_tf_all
+            - run_tests_torch_and_flax_all
+            - run_tests_torch_all
+            - run_tests_tf_all
+            - run_tests_flax_all
+            - run_tests_pipelines_torch_all
+            - run_tests_pipelines_tf_all
+            - run_tests_onnxruntime_all
+            - run_tests_hub_all
+
 #    tpu_testing_jobs:
 #        triggers:
 #            - schedule:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -63,4 +63,13 @@ deploy_doc "c5d6a28" v4.4.1
 deploy_doc "6bc89ed" v4.4.2
 deploy_doc "4906a29" v4.5.0
 deploy_doc "4bae96e" v4.5.1
-deploy_doc "25dee4a"  # v4.6.0 Latest stable release
+deploy_doc "25dee4a" v4.6.0
+deploy_doc "7a6c9fa" v4.7.0
+deploy_doc "9252a51" v4.8.0
+deploy_doc "1366172" v4.8.1
+deploy_doc "96d1cfb" v4.8.2
+deploy_doc "72aee83" v4.9.0
+deploy_doc "bff1c71" v4.9.1
+deploy_doc "41981a2" v4.9.2
+deploy_doc "39cb6f5" v4.10.0
+deploy_doc "28e2787"  # v4.10.1 Latest stable release
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -26,6 +26,7 @@ requirements:
    - regex !=2019.12.17
    - protobuf
    - tokenizers >=0.10.1,<0.11.0
+    - pyyaml >=5.1
  run:
    - python
    - numpy >=1.17
@@ -40,6 +41,7 @@ requirements:
    - regex !=2019.12.17
    - protobuf
    - tokenizers >=0.10.1,<0.11.0
+    - pyyaml >=5.1

 test:
  imports:
--- a/.github/workflows/TROUBLESHOOT.md
+++ b/.github/workflows/TROUBLESHOOT.md
@@ -0,0 +1,9 @@
+# Troubleshooting
+
+This document explains how to deal with various issues on the GitHub Actions self-hosted CI. Entries may include actual solutions or pointers to the Issues that cover them.
+
+## GitHub Actions (self-hosted CI)
+
+* Deepspeed
+
+  - If the JIT build hangs, clear out the cached extensions with `rm -rf ~/.cache/torch_extensions/`. Reference: https://github.com/huggingface/transformers/pull/12723
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@@ -0,0 +1,42 @@
+name: Doctests
+
+on:
+  push:
+    branches:
+      - doctest*
+  repository_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  RUN_SLOW: yes
+  OMP_NUM_THREADS: 16
+  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600
+
+jobs:
+  run_doctests:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libsndfile1-dev
+          pip install --upgrade pip
+          pip install .[dev]
+
+      - name: Run doctests
+        run: |
+          pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -47,6 +47,8 @@ jobs:
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          make style
          python utils/check_table.py --fix_and_overwrite
          python utils/check_dummies.py --fix_and_overwrite
@@ -59,7 +61,7 @@ jobs:
      - name: Run style changes
        run: |
          git fetch origin master:master
-          make fixup
+          make style && make quality

      - name: Failure short reports
        if: ${{ always() }}
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@@ -4,6 +4,8 @@ on:
  push:
    tags:
      - v*
+    branches:
+      - conda_*

 env:
  ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -0,0 +1,257 @@
+name: Self-hosted runner; Nightly (scheduled)
+
+on:
+    push:
+        branches:
+            - nightly_ci*  # any pushed branch whose name starts with "nightly_ci"
+    repository_dispatch:  # can also be started through a repository_dispatch API event
+    schedule:
+        - cron: "0 0 */3 * *"  # 00:00 on every third day of the month (1st, 4th, 7th, ...), not every night
+
+env:
+    HF_HOME: /mnt/cache  # same path the cache volume is mounted at inside the job containers
+    TRANSFORMERS_IS_CI: yes
+    RUN_SLOW: yes  # presumably opts in to tests marked as slow -- confirm in the test utilities
+    OMP_NUM_THREADS: 16
+    MKL_NUM_THREADS: 16
+    PYTEST_TIMEOUT: 600  # presumably a per-test limit in seconds for pytest-timeout -- confirm
+    SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}  # taken from the repository secrets
+
+jobs:
+    run_all_tests_torch_gpu:  # tests/, examples/ and pipeline tests on a single-GPU runner with nightly PyTorch
+        runs-on: [self-hosted, docker-gpu, single-gpu]
+        container:
+            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/  # host cache dir is bind-mounted at /mnt/cache/ (HF_HOME)
+        steps:
+            - name: Launcher docker  # despite the name, this step checks out the repository
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |  # the last command upgrades torch/torchvision/torchaudio to pre-release wheels from the cu111 nightly index
+                  apt -y update && apt install -y libsndfile1-dev git
+                  pip install --upgrade pip
+                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}  # print the report even when the test step above failed
+              run: cat reports/tests_torch_gpu_failures_short.txt
+
+            - name: Run examples tests on GPU
+              if: ${{ always() }}  # run regardless of the outcome of the earlier test step
+              env:  # repeats values that are already set in the workflow-level env
+                  OMP_NUM_THREADS: 16
+                  MKL_NUM_THREADS: 16
+                  RUN_SLOW: yes
+                  HF_HOME: /mnt/cache
+                  TRANSFORMERS_IS_CI: yes
+              run: |
+                  pip install -r examples/pytorch/_tests_requirements.txt
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/examples_torch_gpu_failures_short.txt
+
+            - name: Run all pipeline tests on GPU
+              if: ${{ always() }}
+              env:
+                  RUN_PIPELINE_TESTS: yes  # set only for this step; the pytest call selects tests marked is_pipeline_test
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}  # upload the reports directory whatever the test results were
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_all_tests_torch_gpu_test_reports
+                  path: reports
+
+    run_all_tests_torch_multi_gpu:  # tests/ and pipeline tests on a multi-GPU runner with nightly PyTorch
+        runs-on: [self-hosted, docker-gpu, multi-gpu]
+        container:
+            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/  # all GPUs exposed; host cache dir bind-mounted at /mnt/cache/ (HF_HOME)
+        steps:
+            - name: Launcher docker  # despite the name, this step checks out the repository
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              continue-on-error: true  # a failing nvidia-smi does not stop the job
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |  # the last command upgrades torch/torchvision/torchaudio to pre-release wheels from the cu111 nightly index
+                  apt -y update && apt install -y libsndfile1-dev git
+                  pip install --upgrade pip
+                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              env:
+                  MKL_SERVICE_FORCE_INTEL: 1  # NOTE(review): the reason for forcing this is not visible in this file -- confirm
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}  # print the report even when the test step above failed
+              run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+            - name: Run all pipeline tests on GPU
+              if: ${{ always() }}
+              env:
+                  RUN_PIPELINE_TESTS: yes  # set only for this step; the pytest call selects tests marked is_pipeline_test
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}  # upload the reports directory whatever the test results were
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_all_tests_torch_multi_gpu_test_reports
+                  path: reports
+
+    run_all_tests_torch_cuda_extensions_gpu:  # tests/deepspeed and tests/extended on a single-GPU runner
+        runs-on: [self-hosted, docker-gpu, single-gpu]
+        container:
+            image: nvcr.io/nvidia/pytorch:21.03-py3  # NVIDIA NGC image, unlike the pytorch/pytorch image used by the plain torch jobs
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/  # host cache dir is bind-mounted at /mnt/cache/ (HF_HOME)
+        steps:
+            - name: Launcher docker  # despite the name, this step checks out the repository
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |  # nightly torch wheels are installed first; DeepSpeed is installed last from its GitHub repository
+                  apt -y update && apt install -y libaio-dev
+                  pip install --upgrade pip
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+                  pip install .[testing,deepspeed]
+                  pip install git+https://github.com/microsoft/DeepSpeed
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+            - name: Failure short reports
+              if: ${{ always() }}  # print the report even when the test step above failed
+              run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}  # upload the reports directory whatever the test results were
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_tests_torch_cuda_extensions_gpu_test_reports  # NOTE(review): lacks the "all_" of the job id; check consumers before renaming
+                  path: reports
+
+    run_all_tests_torch_cuda_extensions_multi_gpu:  # tests/deepspeed and tests/extended on a multi-GPU runner
+        runs-on: [self-hosted, docker-gpu, multi-gpu]
+        container:
+            image: nvcr.io/nvidia/pytorch:21.03-py3  # NVIDIA NGC image, unlike the pytorch/pytorch image used by the plain torch jobs
+            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/  # expose every GPU of the multi-gpu runner, as run_all_tests_torch_multi_gpu does
+        steps:
+            - name: Launcher docker  # despite the name, this step checks out the repository
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              continue-on-error: true  # a failing nvidia-smi does not stop the job
+              run: |
+                  nvidia-smi
+
+            - name: Install dependencies
+              run: |  # same as the single-GPU extensions job, plus the fairscale extra
+                  apt -y update && apt install -y libaio-dev
+                  pip install --upgrade pip
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+                  pip install .[testing,deepspeed,fairscale]
+                  pip install git+https://github.com/microsoft/DeepSpeed
+
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+            - name: Failure short reports
+              if: ${{ always() }}  # print the report even when the test step above failed
+              run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+            - name: Test suite reports artifacts
+              if: ${{ always() }}  # upload the reports directory whatever the test results were
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_tests_torch_cuda_extensions_multi_gpu_test_reports  # NOTE(review): lacks the "all_" of the job id; check consumers before renaming
+                  path: reports
+
+    send_results:  # collects the report artifacts of the four test jobs and posts a summary to Slack
+        name: Send results to webhook
+        runs-on: ubuntu-latest
+        if: always()  # run even when some of the needed test jobs failed
+        needs: [
+                run_all_tests_torch_gpu,
+                run_all_tests_torch_multi_gpu,
+                run_all_tests_torch_cuda_extensions_gpu,
+                run_all_tests_torch_cuda_extensions_multi_gpu
+        ]
+        steps:
+            - uses: actions/checkout@v2
+
+            - uses: actions/download-artifact@v2  # no artifact name given, so every artifact of this run is downloaded
+
+            - name: Send message to Slack
+              env:  # Slack credentials and channel ids come from repository secrets
+                  CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+                  CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+                  CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+                  CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+
+              run: |
+                  pip install slack_sdk
+                  python utils/notification_service.py scheduled nightly-torch
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -11,6 +11,7 @@ on:
      - "tests/**"
      - ".github/**"
      - "templates/**"
+      - "utils/**"
  repository_dispatch:

 env:
@@ -18,6 +19,7 @@ env:
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 60

 jobs:
  run_tests_torch_gpu:
@@ -26,32 +28,47 @@ jobs:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          apt install -y libsndfile1-dev
+          pip install --upgrade pip
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libsndfile1-dev
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
-
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all non-slow tests on GPU
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -61,48 +78,118 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

-  run_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    timeout-minutes: 120
+  run_tests_flax_gpu:
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
    container:
      image: tensorflow/tensorflow:2.4.1-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+          pip install --upgrade pip
+          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
+
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-
      - name: Are GPUs recognized by our DL frameworks
        run: |
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all non-slow tests on GPU
-        env:
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_tf_gpu_failures_short.txt
+        if: ${{ failure() }}
+        run: cat reports/tests_flax_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
-          name: run_all_tests_tf_gpu_test_reports
+          name: run_all_tests_flax_gpu_test_reports
          path: reports

+#  run_tests_tf_gpu:
+#    runs-on: [self-hosted, docker-gpu, single-gpu]
+#    timeout-minutes: 120
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
+#
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+#
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
+#      - name: Run all non-slow tests on GPU
+#        env:
+#          TF_NUM_INTRAOP_THREADS: 8
+#          TF_NUM_INTEROP_THREADS: 1
+#        run: |
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt)
+#          fi
+#
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_tf_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_tf_gpu_test_reports
+#          path: reports
+

  run_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
@@ -110,18 +197,22 @@ jobs:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          apt install -y libsndfile1-dev
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+      
+      - name: Launcher docker
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -130,14 +221,26 @@ jobs:
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
      - name: Run all non-slow tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -147,47 +250,117 @@ jobs:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

-  run_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    timeout-minutes: 120
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
+#  run_tests_flax_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
+#
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
+#
+#      - name: NVIDIA-SMI
+#        continue-on-error: true
+#        run: |
+#          nvidia-smi
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+#      
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
+#      - name: Run all non-slow tests on GPU
+#        run: |
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
+#          fi
+#
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_flax_multi_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_flax_multi_gpu_test_reports
+#          path: reports

-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
-      - name: Run all non-slow tests on GPU
-        env:
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_tf_multi_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_all_tests_tf_multi_gpu_test_reports
-          path: reports
+#  run_tests_tf_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    timeout-minutes: 120
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
+#
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+#
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
+#      - name: Run all non-slow tests on GPU
+#        env:
+#          TF_NUM_INTRAOP_THREADS: 8
+#          TF_NUM_INTEROP_THREADS: 1
+#        run: |
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt)
+#          fi
+#
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_tf_multi_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_tf_multi_gpu_test_reports
+#          path: reports

  run_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
@@ -197,6 +370,8 @@ jobs:
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
@@ -214,13 +389,25 @@ jobs:
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
+      
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -238,8 +425,11 @@ jobs:
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

@@ -256,12 +446,24 @@ jobs:
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -278,9 +480,9 @@ jobs:
    if: always()
    needs: [
        run_tests_torch_gpu,
-        run_tests_tf_gpu,
+#        run_tests_tf_gpu,
        run_tests_torch_multi_gpu,
-        run_tests_tf_multi_gpu,
+#        run_tests_tf_multi_gpu,
        run_tests_torch_cuda_extensions_gpu,
        run_tests_torch_cuda_extensions_multi_gpu
    ]
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -14,6 +14,8 @@ env:
  RUN_SLOW: yes
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}

 jobs:
  run_all_tests_torch_gpu:
@@ -31,9 +33,9 @@ jobs:

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[integrations, sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -44,7 +46,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -60,7 +62,7 @@ jobs:
          TRANSFORMERS_IS_CI: yes
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
@@ -71,7 +73,7 @@ jobs:
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -84,6 +86,46 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

+  run_all_tests_flax_gpu:
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_flax_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_flax_gpu_test_reports
+          path: reports
+
  run_all_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
@@ -99,8 +141,9 @@ jobs:

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -112,7 +155,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -125,7 +168,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -148,14 +191,15 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[integrations, sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -168,7 +212,7 @@ jobs:
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -179,7 +223,7 @@ jobs:
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -202,13 +246,15 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -220,7 +266,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -233,7 +279,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -246,6 +292,45 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+#  run_all_tests_flax_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Install dependencies
+#        run: |
+#          pip install --upgrade pip
+#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+#
+#      - name: Run all tests on GPU
+#        run: |
+#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
+#
+#      - name: Failure short reports
+#        if: ${{ always() }}
+#        run: cat reports/tests_flax_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_flax_gpu_test_reports
+#          path: reports
+
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
@@ -274,7 +359,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
@@ -297,6 +382,7 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

@@ -315,7 +401,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
@@ -349,6 +435,7 @@ jobs:
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}


        run: |
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,82 @@
+cff-version: "1.2.0"
+date-released: 2020-10
+message: "If you use this software, please cite it using these metadata."
+title: "Transformers: State-of-the-Art Natural Language Processing"
+url: "https://github.com/huggingface/transformers"
+authors: 
+  - family-names: Wolf
+    given-names: Thomas
+  - family-names: Debut
+    given-names: Lysandre
+  - family-names: Sanh
+    given-names: Victor
+  - family-names: Chaumond
+    given-names: Julien
+  - family-names: Delangue
+    given-names: Clement
+  - family-names: Moi
+    given-names: Anthony
+  - family-names: Cistac
+    given-names: Pierric
+  - family-names: Ma
+    given-names: Clara
+  - family-names: Jernite
+    given-names: Yacine
+  - family-names: Plu
+    given-names: Julien
+  - family-names: Xu
+    given-names: Canwen
+  - family-names: "Le Scao"
+    given-names: Teven
+  - family-names: Gugger
+    given-names: Sylvain
+  - family-names: Drame
+    given-names: Mariama
+  - family-names: Lhoest
+    given-names: Quentin
+  - family-names: Rush
+    given-names: "Alexander M."
+preferred-citation:
+  type: inproceedings
+  authors:
+  - family-names: Wolf
+    given-names: Thomas
+  - family-names: Debut
+    given-names: Lysandre
+  - family-names: Sanh
+    given-names: Victor
+  - family-names: Chaumond
+    given-names: Julien
+  - family-names: Delangue
+    given-names: Clement
+  - family-names: Moi
+    given-names: Anthony
+  - family-names: Cistac
+    given-names: Pierric
+  - family-names: Ma
+    given-names: Clara
+  - family-names: Jernite
+    given-names: Yacine
+  - family-names: Plu
+    given-names: Julien
+  - family-names: Xu
+    given-names: Canwen
+  - family-names: "Le Scao"
+    given-names: Teven
+  - family-names: Gugger
+    given-names: Sylvain
+  - family-names: Drame
+    given-names: Mariama
+  - family-names: Lhoest
+    given-names: Quentin
+  - family-names: Rush
+    given-names: "Alexander M."
+  booktitle: "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations"
+  month: 10
+  start: 38
+  end: 45
+  title: "Transformers: State-of-the-Art Natural Language Processing"
+  year: 2020
+  publisher: "Association for Computational Linguistics"
+  url: "https://www.aclweb.org/anthology/2020.emnlp-demos.6"
+  address: "Online"
--- a/Makefile
+++ b/Makefile
@@ -21,10 +21,15 @@ modified_only_fixup:
 deps_table_update:
 	@python setup.py deps_table_update

+deps_table_check_updated:
+	@md5sum src/transformers/dependency_versions_table.py > md5sum.saved
+	@python setup.py deps_table_update
+	@md5sum -c --quiet md5sum.saved || (printf "\nError: the version dependency table is outdated.\nPlease run 'make fixup' or 'make style' and commit the changes.\n\n" && exit 1)
+	@rm md5sum.saved
+
 # autogenerating code

 autogenerate_code: deps_table_update
-	python utils/class_mapping_update.py

 # Check that source code meets quality standards

@@ -34,6 +39,7 @@ extra_quality_checks:
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/check_inits.py
+	python utils/tests_fetcher.py --sanity_check

 # this target runs checks on all files
 quality:
--- a/README.md
+++ b/README.md
@@ -38,6 +38,14 @@ limitations under the License.
    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
 </p>

+<h4 align="center">
+    <p>
+        <b>English</b> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a>
+    </p>
+</h4>
+
 <h3 align="center">
    <p>State-of-the-art Natural Language Processing for Jax, PyTorch and TensorFlow</p>
 </h3>
@@ -203,6 +211,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
@@ -212,7 +221,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** from (OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
@@ -225,20 +235,25 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
 for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
 Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -246,14 +261,19 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)**
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -0,0 +1,343 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!---
+A useful guide for English-Chinese translation of Hugging Face documentation
+- Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多种语言; 使用 transformers 库。
+- Use square quotes, e.g.,「引用」
+
+Dictionary
+
+Hugging Face: 抱抱脸
+token: 词符（并用括号标注原英文）
+tokenize: 词符化（并用括号标注原英文）
+tokenizer: 词符化器（并用括号标注原英文）
+transformer: transformer（不翻译）
+pipeline: 流水线
+API: API（不翻译）
+inference: 推理
+Trainer: 训练器。当作为类名出现时不翻译。
+pretrained/pretrain: 预训练
+finetune: 微调
+community: 社区
+example: 当特指仓库中 example 目录时翻译为「用例」
+Python data structures (e.g., list, set, dict): 翻译为列表，集合，词典，并用括号标注原英文
+NLP/Natural Language Processing: 以 NLP 出现时不翻译，以 Natural Language Processing 出现时翻译为自然语言处理
+checkpoint: 检查点
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <br>
+</p>
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/transformers/index.html">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/">English</a> |
+        <b>简体中文</b> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a>
+    </p>
+</h4>
+
+<h3 align="center">
+    <p>为 Jax、PyTorch 和 TensorFlow 打造的先进的自然语言处理</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
+</h3>
+
+🤗 Transformers 提供了数以千计的预训练模型，支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨是让最先进的 NLP 技术人人易用。
+
+🤗 Transformers 提供了便于快速下载和使用的 API，让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时，每个定义的 Python 模块均完全独立，方便修改和快速研究实验。
+
+🤗 Transformers 支持三个最热门的深度学习库： [Jax](https://jax.readthedocs.io/en/latest/)、[PyTorch](https://pytorch.org/) 和 [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。
+
+## 在线演示
+
+你可以直接在模型页面上测试大多数 [model hub](https://huggingface.co/models) 上的模型。 我们也提供了 [私有模型托管、模型版本管理以及推理 API](https://huggingface.co/pricing)。
+
+这里是一些例子：
+- [用 BERT 做掩码填词](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [用 Electra 做命名实体识别](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [用 RoBERTa 做自然语言推理](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [用 DistilBERT 做问答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [用 T5 做翻译](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+**[Write With Transformer](https://transformer.huggingface.co)**，由抱抱脸团队打造，是一个文本生成的官方 demo。
+
+## 如果你在寻找由抱抱脸团队提供的定制化支持服务
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## 快速上手
+
+我们为快速使用模型提供了 `pipeline` （流水线）API。流水线聚合了预训练模型和对应的文本预处理。下面是一个快速使用流水线去判断正负面情绪的例子：
+
+```python
+>>> from transformers import pipeline
+
+# 使用情绪分析流水线
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+第二行代码下载并缓存了流水线使用的预训练模型，而第三行代码则在给定的文本上进行了评估。这里的答案“正面” (positive) 具有 99.97% 的置信度。
+
+许多的 NLP 任务都有开箱即用的预训练流水线。比如说，我们可以轻松地从给定文本中抽取问题答案：
+
+```python
+>>> from transformers import pipeline
+
+# 使用问答流水线
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
+
+```
+
+除了给出答案，预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/transformers/task_summary.html)了解更多流水线 API 支持的任务。
+
+要在你的任务上下载和使用任意预训练模型也很简单，只需三行代码。这里是 PyTorch 版的示例：
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+这里是等效的 TensorFlow 代码：
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+词符化器 (tokenizer) 为所有的预训练模型提供了预处理，并可以直接对单个字符串进行调用（比如上面的例子）或对列表 (list) 调用。它会输出一个你可以在下游代码里使用或直接通过 `**` 解包表达式传给模型的词典 (dict)。
+
+模型本身是一个常规的 [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)（取决于你的后端），可以常规方式使用。 [这个教程](https://huggingface.co/transformers/training.html)解释了如何将这样的模型整合到经典的 PyTorch 或 TensorFlow 训练循环中，或是如何使用我们的 `Trainer`（训练器）API 来在一个新的数据集上快速微调。
+
+## 为什么要用 transformers？
+
+1. 便于使用的先进模型：
+    - NLU 和 NLG 上表现优越
+    - 对教学和实践友好且低门槛
+    - 高级抽象，只需了解三个类
+    - 对所有模型统一的 API
+
+1. 更低计算开销，更少的碳排放：
+    - 研究人员可以分享已训练的模型而非次次从头开始训练
+    - 工程师可以减少计算用时和生产环境开销
+    - 数十种模型架构、两千多个预训练模型、100 多种语言支持
+
+1. 对于模型生命周期的每一个部分都面面俱到：
+    - 训练先进的模型，只需 3 行代码
+    - 模型在不同深度学习框架间任意转移，随你心意
+    - 为训练、评估和生产选择最适合的框架，衔接无缝
+
+1. 为你的需求轻松定制专属模型和用例：
+    - 我们为每种模型架构提供了多个用例来复现原论文结果
+    - 模型内部结构保持透明一致
+    - 模型文件可单独使用，方便魔改和快速实验
+
+## 什么情况下我不该用 transformers？
+
+- 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉，未经额外抽象封装，以便研究人员快速迭代魔改而不致溺于抽象和文件跳转之中。
+- `Trainer` API 并非兼容任何模型，只为本库之模型优化。若是在寻找适用于通用机器学习的训练循环实现，请另觅他库。
+- 尽管我们已尽力而为，[examples 目录](https://github.com/huggingface/transformers/tree/master/examples)中的脚本也仅为用例而已。对于你的特定问题，它们并不一定开箱即用，可能需要改几行代码以适之。
+
+## 安装
+
+### 使用 pip
+
+这个仓库已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下经过测试。
+
+你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境，请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
+
+首先，用你打算使用的版本的 Python 创建一个虚拟环境并激活。
+
+然后，你需要安装 Flax、PyTorch 或 TensorFlow 其中之一。关于在你使用的平台上安装这些框架，请参阅 [TensorFlow 安装页](https://www.tensorflow.org/install/), [PyTorch 安装页](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安装页](https://github.com/google/flax#quick-install)。
+
+当这些后端之一安装成功后， 🤗 Transformers 可依此安装：
+
+```bash
+pip install transformers
+```
+
+如果你想要试试用例或者想在正式发布前使用最新的开发中代码，你得[从源代码安装](https://huggingface.co/transformers/installation.html#installing-from-source)。
+
+### 使用 conda
+
+自 Transformers 4.0.0 版始，我们有了一个 conda 频道： `huggingface`。
+
+🤗 Transformers 可以通过 conda 依此安装：
+
+```shell script
+conda install -c huggingface transformers
+```
+
+要通过 conda 安装 Flax、PyTorch 或 TensorFlow 其中之一，请参阅它们各自安装页的说明。
+
+## 模型架构
+
+**🤗 Transformers 支持的[所有的模型检查点](https://huggingface.co/models)** 由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传，均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。
+
+目前的检查点数量： ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers 目前支持如下的架构（模型概述请阅[这里](https://huggingface.co/transformers/model_summary.html)）：
+
+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
+1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
+1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
+1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。
+1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
+1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。
+1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) 和德语版 DistilBERT。
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
+1. **[FNet](https://huggingface.co/transformers/master/model_doc/fnet.html)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
+1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy。
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (来自 EleutherAI) 随仓库 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 发布。作者为 Ben Wang and Aran Komatsuzaki。
+1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
+1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
+1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
+1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/master/model_doc/speechencoderdecoder.html)** 
+1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/master/model_doc/speech_to_text_2.html)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (来自 Google AI) 随仓库 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 发布。作者为 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu。
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
+1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
+1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
+1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。
+1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。
+1. 想要贡献新的模型？我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到它们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。
+
+要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现，或其是否在 🤗 Tokenizers 库中有对应词符化器（tokenizer），敬请参阅[此表](https://huggingface.co/transformers/index.html#supported-frameworks)。
+
+这些实现均已于多个数据集测试（请参看用例脚本）并应与原版实现表现相当。你可以在用例文档的[此节](https://huggingface.co/transformers/examples.html)中了解表现的细节。
+
+
+## 了解更多
+
+| 章节 | 描述 |
+|-|-|
+| [文档](https://huggingface.co/transformers/) | 完整的 API 文档和教程 |
+| [任务总结](https://huggingface.co/transformers/task_summary.html) | 🤗 Transformers 支持的任务 |
+| [预处理教程](https://huggingface.co/transformers/preprocessing.html) | 使用 `Tokenizer` 来为模型准备数据 |
+| [训练和微调](https://huggingface.co/transformers/training.html) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
+| [快速上手：微调和用例脚本](https://github.com/huggingface/transformers/tree/master/examples) | 为各种任务提供的用例脚本 |
+| [模型分享和上传](https://huggingface.co/transformers/model_sharing.html) | 上传并与社区分享你微调的模型 |
+| [迁移](https://huggingface.co/transformers/migration.html) | 从 `pytorch-transformers` 或 `pytorch-pretrained-bert` 迁移到 🤗 Transformers |
+
+## 引用
+
+我们已将此库的[论文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式发表，如果你使用了 🤗 Transformers 库，请引用:
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -0,0 +1,355 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!---
+A useful guide for English-Traditional Chinese translation of Hugging Face documentation
+- Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多種語言; 使用 transformers 函式庫。
+- Use square quotes, e.g.,「引用」
+- Some of the terms in this file can be found at the National Academy for Educational Research (https://terms.naer.edu.tw/), an official website providing bilingual translations between English and Traditional Chinese.
+
+Dictionary
+
+API: API (不翻譯）
+add: 加入
+checkpoint: 檢查點
+code: 程式碼
+community: 社群
+confidence: 信賴度
+dataset: 資料集
+documentation: 文件
+example: 基本翻譯為「範例」，或依語意翻為「例子」
+finetune: 微調
+Hugging Face: Hugging Face（不翻譯）
+implementation: 實作
+inference: 推論
+library: 函式庫
+module: 模組
+NLP/Natural Language Processing: 以 NLP 出現時不翻譯，以 Natural Language Processing 出現時翻譯為自然語言處理
+online demos: 線上Demo
+pipeline: pipeline（不翻譯）
+pretrained/pretrain: 預訓練
+Python data structures (e.g., list, set, dict): 翻譯為串列，集合，字典，並用括號標註原英文
+repository: repository（不翻譯）
+summary: 概覽
+token-: token-（不翻譯）
+Trainer: Trainer（不翻譯）
+transformer: transformer（不翻譯）
+tutorial: 教學
+user: 使用者
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <br>
+</p>
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/transformers/index.html">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/">English</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
+        <b>繁體中文</b>
+    </p>
+</h4>
+
+<h3 align="center">
+    <p>為 Jax、PyTorch 以及 TensorFlow 打造的先進自然語言處理函式庫</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
+</h3>
+
+🤗 Transformers 提供了數以千計的預訓練模型，支援 100 多種語言的文本分類、資訊擷取、問答、摘要、翻譯、文本生成。它的宗旨是讓最先進的 NLP 技術人人易用。
+
+🤗 Transformers 提供了便於快速下載和使用的 API，讓你可以將預訓練模型用在給定文本、在你的資料集上微調然後經由 [model hub](https://huggingface.co/models) 與社群共享。同時，每個定義的 Python 模組架構均完全獨立，方便修改和快速研究實驗。
+
+🤗 Transformers 支援三個最熱門的深度學習函式庫： [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 並與之完美整合。你可以直接使用其中一個框架訓練你的模型，然後用另一個載入和推論。
+
+## 線上Demo
+
+你可以直接在 [model hub](https://huggingface.co/models) 上測試大多數的模型。我們也提供了 [私有模型託管、模型版本管理以及推論 API](https://huggingface.co/pricing)。
+
+這裡是一些範例：
+- [用 BERT 做遮蓋填詞](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [用 Electra 做專有名詞辨識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [用 RoBERTa 做自然語言推論](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [用 DistilBERT 做問答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [用 T5 做翻譯](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+**[Write With Transformer](https://transformer.huggingface.co)**，由 Hugging Face 團隊所打造，是一個文本生成的官方 demo。
+
+## 如果你在尋找由 Hugging Face 團隊所提供的客製化支援服務
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## 快速上手
+
+我們為快速使用模型提供了 `pipeline` API。 Pipeline 包含了預訓練模型和對應的文本預處理。下面是一個快速使用 pipeline 去判斷正負面情緒的例子：
+
+```python
+>>> from transformers import pipeline
+
+# 使用情緒分析 pipeline
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+第二行程式碼下載並快取 pipeline 使用的預訓練模型，而第三行程式碼則在給定的文本上進行了評估。這裡的答案「正面」 (positive) 具有 99.97% 的信賴度。
+
+許多的 NLP 任務都有隨選即用的預訓練 `pipeline`。例如，我們可以輕鬆地從給定文本中擷取問題答案：
+
+```python
+>>> from transformers import pipeline
+
+# 使用問答 pipeline
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
+
+```
+
+除了提供問題解答，預訓練模型還提供了對應的信賴度分數以及解答在 tokenized 後的文本中開始和結束的位置。你可以從[這個教學](https://huggingface.co/transformers/task_summary.html)了解更多 `pipeline` API 支援的任務。
+
+要在你的任務中下載和使用任何預訓練模型很簡單，只需三行程式碼。這裡是 PyTorch 版的範例：
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+這裡是對應的 TensorFlow 程式碼：
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+Tokenizer 為所有的預訓練模型提供了預處理，並可以直接轉換單一字串（比如上面的例子）或串列 (list)。它會輸出一個字典 (dict) 讓你可以在下游程式碼裡使用或直接藉由 `**` 運算子傳給模型。
+
+模型本身是一個常規的 [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)（取決於你的後端），可依常規方式使用。 [這個教學](https://huggingface.co/transformers/training.html)解釋了如何將這樣的模型整合到一般的 PyTorch 或 TensorFlow 訓練迴圈中，或是如何使用我們的 `Trainer` API 在一個新的資料集上快速進行微調。
+
+## 為什麼要用 transformers？
+
+1. 便於使用的先進模型：
+    - NLU 和 NLG 上性能卓越
+    - 對教學和實作友好且低門檻
+    - 高度抽象，使用者只須學習 3 個類別
+    - 對所有模型使用的制式化 API
+
+1. 更低的運算成本，更少的碳排放：
+    - 研究人員可以分享預訓練的模型而非從頭開始訓練
+    - 工程師可以減少計算時間以及生產成本
+    - 數十種模型架構、兩千多個預訓練模型、100 多種語言支援
+
+1. 對於模型生命週期的每一個部分都面面俱到：
+    - 訓練先進的模型，只需 3 行程式碼
+    - 模型可以在不同深度學習框架之間任意轉換
+    - 為訓練、評估和生產選擇最適合的框架，並完美銜接
+
+1. 為你的需求輕鬆客製化專屬模型和範例：
+    - 我們為每種模型架構提供了多個範例來重現原論文結果
+    - 一致的模型內部架構
+    - 模型檔案可單獨使用，便於修改和快速實驗
+
+## 什麼情況下我不該用 transformers？
+
+- 本函式庫並不是模組化的神經網路工具箱。模型檔案中的程式碼並未做額外的抽象封裝，以便研究人員快速地翻閱及修改程式碼，而不會深陷複雜的類別包裝之中。
+- `Trainer` API 並非相容任何模型，它只為本函式庫中的模型最佳化。對於一般的機器學習用途，請使用其他函式庫。
+- 儘管我們已盡力而為，[examples 目錄](https://github.com/huggingface/transformers/tree/master/examples)中的腳本也僅為範例而已。對於特定問題，它們並不一定隨選即用，可能需要修改幾行程式碼以符合需求。
+
+## 安裝
+
+### 使用 pip
+
+這個 Repository 已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下經過測試。
+
+你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境，請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
+
+首先，用你打算使用的版本的 Python 創建一個虛擬環境並進入。
+
+然後，你需要安裝 Flax、PyTorch 或 TensorFlow 其中之一。對於該如何在你使用的平台上安裝這些框架，請參閱 [TensorFlow 安裝頁面](https://www.tensorflow.org/install/), [PyTorch 安裝頁面](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安裝頁面](https://github.com/google/flax#quick-install)。
+
+當其中一個後端安裝成功後，🤗 Transformers 可依此安裝：
+
+```bash
+pip install transformers
+```
+
+如果你想要試試範例或者想在正式發布前使用最新開發中的程式碼，你必須[從原始碼安裝](https://huggingface.co/transformers/installation.html#installing-from-source)。
+
+### 使用 conda
+
+自 Transformers 4.0.0 版始，我們有了一個 conda channel： `huggingface`。
+
+🤗 Transformers 可以藉由 conda 依此安裝：
+
+```shell script
+conda install -c huggingface transformers
+```
+
+要藉由 conda 安裝 Flax、PyTorch 或 TensorFlow 其中之一，請參閱它們各自安裝頁面的說明。
+
+## 模型架構
+
+**🤗 Transformers 支援的[所有的模型檢查點](https://huggingface.co/models)**，由[使用者](https://huggingface.co/users)和[組織](https://huggingface.co/organizations)上傳，均與 huggingface.co [model hub](https://huggingface.co) 完美結合。
+
+目前的檢查點數量： ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers 目前支援以下的架構（模型概覽請參閱[這裡](https://huggingface.co/transformers/model_summary.html)）：
+
+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
+1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
+1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/master/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
+1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/master/model_doc/speechencoderdecoder.html)** 
+1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/master/model_doc/speech_to_text_2.html)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
+1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. 想要貢獻新的模型？我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。
+
+要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作，或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer，敬請參閱[此表](https://huggingface.co/transformers/index.html#supported-frameworks)。
+
+這些實作均已於多個資料集測試（請參閱範例腳本）並應與原版實作表現相當。你可以在範例文件的[此節](https://huggingface.co/transformers/examples.html)中了解實作的細節。
+
+
+## 了解更多
+
+| 章節 | 描述 |
+|-|-|
+| [文件](https://huggingface.co/transformers/) | 完整的 API 文件和教學 |
+| [任務概覽](https://huggingface.co/transformers/task_summary.html) | 🤗 Transformers 支援的任務 |
+| [預處理教學](https://huggingface.co/transformers/preprocessing.html) | 使用 `Tokenizer` 來為模型準備資料 |
+| [訓練和微調](https://huggingface.co/transformers/training.html) | 使用 PyTorch/TensorFlow 的內建的訓練方式或於 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
+| [快速上手：微調和範例腳本](https://github.com/huggingface/transformers/tree/master/examples) | 為各種任務提供的範例腳本 |
+| [模型分享和上傳](https://huggingface.co/transformers/model_sharing.html) | 上傳並與社群分享你微調的模型 |
+| [遷移](https://huggingface.co/transformers/migration.html) | 從 `pytorch-transformers` 或 `pytorch-pretrained-bert` 遷移到 🤗 Transformers |
+
+## 引用
+
+我們已將此函式庫的[論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式發表。如果你使用了 🤗 Transformers 函式庫，可以引用：
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,14 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.6.0"
+const stableVersion = "v4.10.1"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.6.0 (stable)",
+    "": "v4.10.0/v4.10.1 (stable)",
+    "v4.9.2": "v4.9.0/v4.9.1/v4.9.2",
+    "v4.8.2": "v4.8.0/v4.8.1/v4.8.2",
+    "v4.7.0": "v4.7.0",
+    "v4.6.0": "v4.6.0",
    "v4.5.1": "v4.5.0/v4.5.1",
    "v4.4.2": "v4.4.0/v4.4.1/v4.4.2",
    "v4.3.3": "v4.3.0/v4.3.1/v4.3.2/v4.3.3",
--- a/docs/source/add_new_pipeline.rst
+++ b/docs/source/add_new_pipeline.rst
@@ -0,0 +1,144 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+How to add a pipeline to 🤗 Transformers?
+=======================================================================================================================
+
+First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
+dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as
+possible as it makes compatibility easier (even through other languages via JSON). Those will be the :obj:`inputs` of
+the pipeline (:obj:`preprocess`).
+
+Then define the :obj:`outputs`. Same policy as the :obj:`inputs`. The simpler, the better. Those will be the outputs of
+:obj:`postprocess` method.
+
+Start by inheriting the base class :obj:`Pipeline` with the 4 methods needed to implement: :obj:`preprocess`,
+:obj:`_forward`, :obj:`postprocess` and :obj:`_sanitize_parameters`.
+
+
+.. code-block::
+
+    from transformers import Pipeline
+
+    class MyPipeline(Pipeline):
+        def _sanitize_parameters(self, **kwargs):
+            preprocess_kwargs = {}
+            if "maybe_arg" in kwargs:
+                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+            return preprocess_kwargs, {}, {}
+
+        def preprocess(self, inputs, maybe_arg=2):
+            model_input = Tensor(....)
+            return {"model_input": model_input}
+
+        def _forward(self, model_inputs):
+            # model_inputs == {"model_input": model_input}
+            outputs = self.model(**model_inputs)
+            # Maybe {"logits": Tensor(...)}
+            return outputs
+
+        def postprocess(self, model_outputs):
+            best_class = model_outputs["logits"].softmax(-1)
+            return best_class
+
+
+The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing
+pre/postprocessing on the CPU on different threads.
+
+:obj:`preprocess` will take the originally defined inputs and turn them into something feedable to the model. It
+might contain more information and is usually a :obj:`Dict`.
+
+:obj:`_forward` is the implementation detail and is not meant to be called directly. :obj:`forward` is the preferred
+called method as it contains safeguards to make sure everything is working on the expected device. If anything is
+linked to a real model it belongs in the :obj:`_forward` method, anything else is in the preprocess/postprocess.
+
+:obj:`postprocess` methods will take the output of :obj:`_forward` and turn it into the final output that was decided
+earlier.
+
+:obj:`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
+time ``pipeline(...., maybe_arg=4)`` or at call time ``pipe = pipeline(...); output = pipe(...., maybe_arg=4)``.
+
+The returns of :obj:`_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to :obj:`preprocess`,
+:obj:`_forward` and :obj:`postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
+allows to keep the default arguments in the function definition which is always more "natural".
+
+A classic example would be a :obj:`top_k` argument in the post processing in classification tasks.
+
+.. code-block::
+
+    >>> pipe = pipeline("my-new-task")
+    >>> pipe("This is a test")
+    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05},
+    {"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+
+    >>> pipe("This is a test", top_k=2)
+    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+
+In order to achieve that, we'll update our :obj:`postprocess` method with a default parameter of :obj:`5`, and edit
+:obj:`_sanitize_parameters` to allow this new parameter.
+
+
+.. code-block::
+
+
+        def postprocess(self, model_outputs, top_k=5):
+            best_class = model_outputs["logits"].softmax(-1)
+            # Add logic to handle top_k
+            return best_class
+
+        def _sanitize_parameters(self, **kwargs):
+            preprocess_kwargs = {}
+            if "maybe_arg" in kwargs:
+                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+
+            postprocess_kwargs = {}
+            if "top_k" in kwargs:
+                postprocess_kwargs["top_k"] = kwargs["top_k"]
+            return preprocess_kwargs, {}, postprocess_kwargs
+
+Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
+without requiring users to understand new kinds of objects. It's also relatively common to support many different types
+of arguments for ease of use (audio files can be filenames, URLs or pure bytes).
+
+
+
+Adding it to the list of supported tasks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Go to ``src/transformers/pipelines/__init__.py`` and fill in :obj:`SUPPORTED_TASKS` with your newly created pipeline.
+If possible it should provide a default model.
+
+Adding tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Create a new file ``tests/test_pipelines_MY_PIPELINE.py``, following the example of the other tests.
+
+The :obj:`run_pipeline_test` function will be very generic and run on small random models on every possible
+architecture as defined by :obj:`model_mapping` and :obj:`tf_model_mapping`.
+
+This is very important to test future compatibility, meaning if someone adds a new model for
+:obj:`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
+impossible to check for actual values, that's why there is a helper :obj:`ANY` that will simply attempt to match the
+TYPE of the pipeline output.
+
+You also *need* to implement 2 (ideally 4) tests.
+
+- :obj:`test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_tf`.
+- :obj:`test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_pt`.
+- :obj:`test_large_model_pt` (:obj:`optional`): Tests the pipeline on a real model where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases.
+- :obj:`test_large_model_tf` (:obj:`optional`): Tests the pipeline on a real model where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases.
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -1,4 +1,4 @@
-# Community
+# Community

 This page regroups resources around 🤗 Transformers developed by the community.

@@ -12,6 +12,7 @@ This page regroups resources around 🤗 Transformers developed by the community

 | Notebook     |      Description      |      Author      |      |
 |:----------|:-------------|:-------------|------:|
+| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model |  [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
 | [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
 | [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb)  | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
 | [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb)  | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning |  [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -27,7 +27,12 @@ author = "huggingface"
 # The short X.Y version
 version = ""
 # The full version, including alpha/beta/rc tags
-release = u'4.7.0'
+release = "4.11.2"
+
+
+
+
+



@@ -208,6 +213,9 @@ epub_title = project
 # A list of files that should not be packed into the epub file.
 epub_exclude_files = ["search.html"]

+# Localization
+locale_dirs = ['locale/']
+gettext_compact = False

 def setup(app):
    app.add_css_file("css/huggingface.css")
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -24,7 +24,11 @@ Underflow and Overflow Detection

 .. note::

-   This feature can be used with any ``nn.Module``-based model
+   For multi-GPU training it requires DDP (``torch.distributed.launch``).
+
+.. note::
+
+   This feature can be used with any ``nn.Module``-based model.

 If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
--- a/docs/source/imgs/parallelism-deepspeed-3d.png
+++ b/docs/source/imgs/parallelism-deepspeed-3d.png
--- a/docs/source/imgs/parallelism-flexflow.jpeg
+++ b/docs/source/imgs/parallelism-flexflow.jpeg
--- a/docs/source/imgs/parallelism-gpipe-bubble.png
+++ b/docs/source/imgs/parallelism-gpipe-bubble.png
--- a/docs/source/imgs/parallelism-sagemaker-interleaved-pipeline.png
+++ b/docs/source/imgs/parallelism-sagemaker-interleaved-pipeline.png
--- a/docs/source/imgs/parallelism-tp-independent-gelu.png
+++ b/docs/source/imgs/parallelism-tp-independent-gelu.png
--- a/docs/source/imgs/parallelism-tp-parallel_gemm.png
+++ b/docs/source/imgs/parallelism-tp-parallel_gemm.png
--- a/docs/source/imgs/parallelism-tp-parallel_self_attention.png
+++ b/docs/source/imgs/parallelism-tp-parallel_self_attention.png
--- a/docs/source/imgs/parallelism-tp-parallel_shard_processing.png
+++ b/docs/source/imgs/parallelism-tp-parallel_shard_processing.png
--- a/docs/source/imgs/parallelism-zero-dp-pp.png
+++ b/docs/source/imgs/parallelism-zero-dp-pp.png
--- a/docs/source/imgs/parallelism-zero.png
+++ b/docs/source/imgs/parallelism-zero.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -105,184 +105,219 @@ Supported models
 3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
   French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
   Tixier, Michalis Vazirgiannis.
-4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+4. :doc:`BEiT <model_doc/beit>` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers
+   <https://arxiv.org/abs/2106.08254>`__ by Hangbo Bao, Li Dong, Furu Wei.
+5. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
   Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
   Kenton Lee and Kristina Toutanova.
-5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
+6. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
+7. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
   for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
   Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-7. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
+8. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
   Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava
   Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-8. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+9. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-9. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
-   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
-   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-10. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+10. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building
+    an open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju,
+    Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+11. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
    <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
-11. :doc:`ByT5 <model_doc/byt5>` (from Google Research) released with the paper `ByT5: Towards a token-free future with
+12. :doc:`ByT5 <model_doc/byt5>` (from Google Research) released with the paper `ByT5: Towards a token-free future with
    pre-trained byte-to-byte models <https://arxiv.org/abs/2105.13626>`__ by Linting Xue, Aditya Barua, Noah Constant,
    Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-12. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+13. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-13. :doc:`CLIP <model_doc/clip>` from (OpenAI) released with the paper `Learning Transferable Visual Models From
+14. :doc:`CANINE <model_doc/canine>` (from Google Research) released with the paper `CANINE: Pre-training an Efficient
+    Tokenization-Free Encoder for Language Representation <https://arxiv.org/abs/2103.06874>`__ by Jonathan H. Clark,
+    Dan Garrette, Iulia Turc, John Wieting.
+15. :doc:`CLIP <model_doc/clip>` (from OpenAI) released with the paper `Learning Transferable Visual Models From
    Natural Language Supervision <https://arxiv.org/abs/2103.00020>`__ by Alec Radford, Jong Wook Kim, Chris Hallacy,
    Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen
    Krueger, Ilya Sutskever.
-14. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+16. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-15. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
+17. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-16. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+18. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-17. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+19. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-18. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+20. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-19. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
+21. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-20. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
+22. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
    <https://arxiv.org/abs/2005.12872>`__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
    Alexander Kirillov, Sergey Zagoruyko.
-21. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+23. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-22. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+24. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-23. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+25. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-24. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+26. :doc:`EncoderDecoder <model_doc/encoderdecoder>` (from Google Research) released with the paper `Leveraging
+    Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
+    Narayan, Aliaksei Severyn.
+27. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-25. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+28. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-26. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+29. :doc:`FNet <model_doc/fnet>` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier
+    Transforms <https://arxiv.org/abs/2105.03824>`__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago
+    Ontanon.
+30. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-27. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+31. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-28. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+32. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-29. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+33. :doc:`GPT-J <model_doc/gptj>` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax
+    <https://github.com/kingoflolz/mesh-transformer-jax/>`__ by Ben Wang and Aran Komatsuzaki.
+34. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-30. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
+35. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
    Representation Learning by Masked Prediction of Hidden Units <https://arxiv.org/abs/2106.07447>`__ by Wei-Ning Hsu,
    Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-31. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
-    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-32. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+36. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
+37. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-33. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+38. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
+    Multi-modal Pre-training for Visually-Rich Document Understanding <https://arxiv.org/abs/2012.14740>`__ by Yang Xu,
+    Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min
+    Zhang, Lidong Zhou.
+39. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
+    Multimodal Pre-training for Multilingual Visually-rich Document Understanding <https://arxiv.org/abs/2104.08836>`__
+    by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
+40. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-34. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+41. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-35. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
+42. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
    Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
    Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-36. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+43. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-37. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
-    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
-    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
-    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-38. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+44. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma,
+    Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal,
+    Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+45. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-39. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+46. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-40. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+47. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-41. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+48. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-42. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+49. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-43. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+50. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-44. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+51. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-45. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
-    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
+52. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-46. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+53. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-47. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+54. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-48. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+55. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
+    pre-trained language models <https://arxiv.org/pdf/2010.12821.pdf>`__ by Hyung Won Chung, Thibault Févry, Henry
+    Tsai, M. Johnson, Sebastian Ruder.
+56. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper `A Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-49. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
+57. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper `RoFormer:
    Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
    Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-50. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+58. :doc:`SpeechEncoderDecoder <model_doc/speechencoderdecoder>`
+59. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-51. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
-    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
-    Krishna, and Kurt W. Keutzer.
-52. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+60. :doc:`SpeechToTextTransformer2 <model_doc/speech_to_text_2>` (from Facebook), released together with the paper
+    `Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
+    Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+61. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
+    Question Answering by Pretraining Span Selection <https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain,
+    Jonathan Berant, Amir Globerson, Omer Levy.
+62. :doc:`SqueezeBert <model_doc/squeezebert>` (from Berkeley) released with the paper `SqueezeBERT: What can computer
+    vision teach NLP about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola,
+    Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+63. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-53. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+64. :doc:`T5v1.1 <model_doc/t5v1.1>` (from Google AI) released in the repository
+    `google-research/text-to-text-transfer-transformer
+    <https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__ by
+    Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi
+    Zhou and Wei Li and Peter J. Liu.
+65. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-54. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+66. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-55. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+67. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-56. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
+68. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
    Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
    Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-57. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+69. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-58. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+70. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-59. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+71. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-60. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+72. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-61. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+73. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-62. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+74. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -302,10 +337,12 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            Model            | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
 +=============================+================+================+=================+====================+==============+
-|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            BART             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            BERT             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
@@ -316,69 +353,81 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|       BlenderbotSmall       |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Canine            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            DeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 | FairSeq Machine-Translation |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          FlauBERT           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            FNet             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Hubert            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|            GPT-J            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         LayoutLMv2          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            LUKE             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           M2M100            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -386,27 +435,35 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           RemBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          RoFormer           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|   Speech Encoder decoder    |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Speech2Text         |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        Speech2Text2         |       ✅       |       ❌       |       ❌        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Splinter           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            TAPAS            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         VisualBert          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -416,10 +473,6 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
    :maxdepth: 2
@@ -457,7 +510,10 @@ Flax), PyTorch, and/or TensorFlow.
    migration
    contributing
    add_new_model
+    add_new_pipeline
    fast_tokenizers
+    performance
+    parallelism
    testing
    debugging
    serialization
@@ -496,6 +552,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/auto
    model_doc/bart
    model_doc/barthez
+    model_doc/beit
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
@@ -507,6 +564,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/bort
    model_doc/byt5
    model_doc/camembert
+    model_doc/canine
    model_doc/clip
    model_doc/convbert
    model_doc/cpm
@@ -521,11 +579,14 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/electra
    model_doc/encoderdecoder
    model_doc/flaubert
+    model_doc/fnet
    model_doc/fsmt
    model_doc/funnel
    model_doc/herbert
    model_doc/ibert
    model_doc/layoutlm
+    model_doc/layoutlmv2
+    model_doc/layoutxlm
    model_doc/led
    model_doc/longformer
    model_doc/luke
@@ -540,6 +601,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
+    model_doc/gptj
    model_doc/gpt_neo
    model_doc/hubert
    model_doc/pegasus
@@ -547,12 +609,17 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/prophetnet
    model_doc/rag
    model_doc/reformer
+    model_doc/rembert
    model_doc/retribert
    model_doc/roberta
    model_doc/roformer
+    model_doc/speechencoderdecoder
    model_doc/speech_to_text
+    model_doc/speech_to_text_2
+    model_doc/splinter
    model_doc/squeezebert
    model_doc/t5
+    model_doc/t5v1.1
    model_doc/tapas
    model_doc/transformerxl
    model_doc/vit
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -172,7 +172,19 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path t5-s
 ```
 and it should succeed without any hanging waiting to timeout.

+#### Fetching models and tokenizers to use offline

+When running a script for the first time as mentioned above, the downloaded files will be cached for future reuse.
+However, it is also possible to download files and point to their local path instead.
+
+Downloading files can be done through the Web Interface by clicking on the "Download" button, but it can also be handled
+programmatically using the `huggingface_hub` library, which is a dependency of `transformers`:
+
+- Using `snapshot_download` to download an entire repository
+- Using `hf_hub_download` to download a specific file
+
+See the reference for these methods in the huggingface_hub
+[documentation](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub).

 ## Do you want to run a Transformer model on a mobile device?

--- a/docs/source/internal/file_utils.rst
+++ b/docs/source/internal/file_utils.rst
@@ -51,4 +51,4 @@ Special Properties
 Other Utilities
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.file_utils._BaseLazyModule
+.. autoclass:: transformers.file_utils._LazyModule
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -63,7 +63,6 @@ TensorFlow custom layers
    :members: call

 .. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
-    :members: call


 TensorFlow loss functions
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -17,9 +17,15 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
 either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
 from HuggingFace's AWS S3 repository).

+Each derived config class implements model specific attributes. Common attributes present in all config classes are:
+:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
+:obj:`vocab_size`.
+
+

 PretrainedConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PretrainedConfig
+    :special-members: push_to_hub
    :members:
--- a/docs/source/main_classes/data_collator.rst
+++ b/docs/source/main_classes/data_collator.rst
@@ -18,7 +18,7 @@ the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.

 To be able to build batches, data collators may apply some processing (like padding). Some of them (like
 :class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
-oin the formed batch.
+on the formed batch.

 Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.

@@ -54,18 +54,18 @@ DataCollatorForLanguageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


 DataCollatorForWholeWordMask
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


 DataCollatorForPermutationLanguageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
--- a/docs/source/main_classes/deepspeed.rst
+++ b/docs/source/main_classes/deepspeed.rst
@@ -73,8 +73,6 @@ or via ``transformers``' ``extras``:

    pip install transformers[deepspeed]

-(will become available starting from ``transformers==4.6.0``)
-
 or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__ and
 `advanced install <https://www.deepspeed.ai/tutorials/advanced-install/>`__.

@@ -90,20 +88,31 @@ To make a local build for DeepSpeed:
    git clone https://github.com/microsoft/DeepSpeed/
    cd DeepSpeed
    rm -rf build
-    TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \
+    TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
    --global-option="build_ext" --global-option="-j8" --no-cache -v \
    --disable-pip-version-check 2>&1 | tee build.log

-Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use.
+If you intend to use NVMe offload you will need to also include ``DS_BUILD_AIO=1`` in the instructions above (and also
+install ``libaio-dev`` system-wide).

-Or if you need to use the same setup on multiple machines, make a binary wheel:
+Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use. Assuming all
+your cards are the same you can get the arch via:
+
+.. code-block:: bash
+
+    CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())"
+
+So if you get ``8, 6``, then use ``TORCH_CUDA_ARCH_LIST="8.6"``. If you have multiple different cards, you can list all
+of them like so: ``TORCH_CUDA_ARCH_LIST="6.1;8.6"``.
+
+If you need to use the same setup on multiple machines, make a binary wheel:

 .. code-block:: bash

    git clone https://github.com/microsoft/DeepSpeed/
    cd DeepSpeed
    rm -rf build
-    TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \
+    TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
    python setup.py build_ext -j8 bdist_wheel

 it will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install
@@ -692,7 +701,17 @@ be ignored.

 - ``sub_group_size``: ``1e9``

-This one does impact GPU memory usage. But no docs at the moment on Deepspeed side to explain the tuning.
+``sub_group_size`` controls the granularity in which parameters are updated during optimizer steps. Parameters are
+grouped into buckets of ``sub_group_size`` and each bucket is updated one at a time. When used with NVMe offload in
+ZeRO-Infinity, ``sub_group_size`` therefore controls the granularity in which model states are moved in and out of CPU
+memory from NVMe during the optimizer step. This prevents running out of CPU memory for extremely large models.
+
+You can leave ``sub_group_size`` at its default value of ``1e9`` when not using NVMe offload. You may want to change
+its default value in the following cases:
+
+1. Running into OOM during optimizer step: Reduce ``sub_group_size`` to reduce memory utilization of temporary buffers
+2. Optimizer Step is taking a long time: Increase ``sub_group_size`` to improve bandwidth utilization as a result of
+   the increased data buffers.


 .. _deepspeed-nvme:
@@ -1042,7 +1061,8 @@ optimizers, with the exception of using the combination of HuggingFace scheduler
 | DS Optimizer | No           | Yes          |
 +--------------+--------------+--------------+

-If ``offload_optimizer`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer.
+It is possible to use a non-DeepSpeed optimizer when ``offload_optimizer`` is enabled, as long as it has both CPU and
+GPU implementations (except LAMB).



@@ -1136,8 +1156,8 @@ Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed:
  therefore, if you don't configure the scheduler this is scheduler that will get configured by default.

 If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
-the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version
-of it.
+the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` or ``--warmup_ratio`` to configure a
+🤗 Transformers version of it.

 Here is an example of the auto-configured ``scheduler`` entry for ``WarmupLR``:

@@ -1158,9 +1178,10 @@ Since `"auto"` is used the :class:`~transformers.Trainer` arguments will set the
 file. This is so that there is one definitive source of the values and to avoid hard to find errors when, for example,
 the learning rate is set to different values in different places. Command line rules. The values that get set are:

- ``warmup_min_lr`` with the value of ``0``
- ``warmup_max_lr`` with the value of ``--learning_rate``
- ``warmup_num_steps`` with the value of ``--warmup_steps``
+- ``warmup_min_lr`` with the value of ``0``.
+- ``warmup_max_lr`` with the value of ``--learning_rate``.
+- ``warmup_num_steps`` with the value of ``--warmup_steps`` if provided. Otherwise it will use ``--warmup_ratio``
+  multiplied by the number of training steps and rounded up.
 - ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run
  time based on the environment and the size of the dataset and other command line arguments (needed for
  ``WarmupDecayLR``).
@@ -1437,8 +1458,56 @@ won't be possible to load it back.

 While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to
 the `models hub <https://huggingface.co/models>`__ or pass it to someone else you most likely will want to get the fp32
-weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this
-is performed offline.
+weights. This ideally shouldn't be done during training since this is a process that requires a lot of memory, and
+is therefore best performed offline after the training is complete. But if desired, and you have plenty of free CPU
+memory, it can be done in the same training script. The following sections will discuss both approaches.
+
+
+**Live FP32 Weights Recovery:**
+
+This approach may not work if your model is large and you have little free CPU memory left at the end of the training.
+
+If you have saved at least one checkpoint, and you want to use the latest one, you can do the following:
+
+.. code-block:: python
+
+    from transformers.trainer_utils import get_last_checkpoint
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+    checkpoint_dir = get_last_checkpoint(trainer.args.output_dir)
+    fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+If you're using the ``--load_best_model_at_end`` :class:`~transformers.TrainingArguments` argument (to track the best
+checkpoint), then you can finish the training by first saving the final model explicitly and then do the same as above:
+
+.. code-block:: python
+
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+    checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final")
+    trainer.deepspeed.save_checkpoint(checkpoint_dir)
+    fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+.. note::
+
+    Note that once ``load_state_dict_from_zero_checkpoint`` was run, the ``model`` will no longer be usable in the
+    DeepSpeed context of the same application, i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the DeepSpeed magic from it. So do this only at the very end
+    of the training.
+
+Of course, you don't have to use :class:`~transformers.Trainer` and you can adjust the examples above to your own
+trainer.
+
+If for some reason you want more refinement, you can also extract the fp32 ``state_dict`` of the weights and apply
+these yourself as is shown in the following example:
+
+.. code-block:: python
+
+    from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+    model = model.cpu()
+    model.load_state_dict(state_dict)
+
+
+**Offline FP32 Weights Recovery:**

 DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint
 folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to
@@ -1467,15 +1536,16 @@ weights just run:

 .. code-block:: bash

-    python zero_to_fp32.py global_step1 pytorch_model.bin
+    python zero_to_fp32.py . pytorch_model.bin

-The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint.
+This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+
+The script will automatically be able to handle either a ZeRO-2 or ZeRO-3 checkpoint.

 ``python zero_to_fp32.py -h`` will give you usage details.

-If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights.
-
-This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+The script will auto-discover the deepspeed sub-folder using the contents of the file ``latest``, which in the current
+example will contain ``global_step1``.

 Note: currently the script requires 2x general RAM of the final fp32 model weights.

@@ -1530,6 +1600,8 @@ Note: If the fp16 weights of the model can't fit onto the memory of a single GPU
 For full details on this method and other related features please refer to `Constructing Massive Models
 <https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models>`__.

+Also when loading fp16-pretrained models, you will want to tell ``from_pretrained`` to use
+``torch_dtype=torch.float16``. For details, please see :ref:`from_pretrained-torch-dtype`.


 Gathering Parameters
@@ -1555,6 +1627,56 @@ stress on ``tensor([1.])``, or if you get an error where it says the parameter i
 larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.


+
+
+Filing Issues
+=======================================================================================================================
+
+Here is how to file an issue so that we could quickly get to the bottom of the issue and help you to unblock your work.
+
+In your report please always include:
+
+1. the full Deepspeed config file in the report
+
+2. either the command line arguments if you were using the :class:`~transformers.Trainer` or
+   :class:`~transformers.TrainingArguments` arguments if you were scripting the Trainer setup yourself. Please do not
+   dump the :class:`~transformers.TrainingArguments` as it has dozens of entries that are irrelevant.
+
+3. Output of:
+
+.. code-block:: bash
+
+    python -c 'import torch; print(f"torch: {torch.__version__}")'
+    python -c 'import transformers; print(f"transformers: {transformers.__version__}")'
+    python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'
+
+4. If possible include a link to a Google Colab notebook that we can reproduce the problem with. You can use this
+   `notebook <https://github.com/stas00/porting/blob/master/transformers/deepspeed/DeepSpeed_on_colab_CLI.ipynb>`__ as
+   a starting point.
+
+5. Unless it's impossible please always use a standard dataset that we can use and not something custom.
+
+6. If possible try to use one of the existing `examples
+   <https://github.com/huggingface/transformers/tree/master/examples/pytorch>`__ to reproduce the problem with.
+
+Things to consider:
+
+* Deepspeed is often not the cause of the problem.
+
+    Some of the filed issues proved to be Deepspeed-unrelated. That is once Deepspeed was removed from the setup, the
+    problem was still there.
+
+    Therefore, if it's not absolutely obvious it's a DeepSpeed-related problem, as in you can see that there is an
+    exception and you can see that DeepSpeed modules are involved, first re-test your setup without DeepSpeed in it.
+    And only if the problem persists then do mention DeepSpeed and supply all the required details.
+
+* If it's clear to you that the issue is in the DeepSpeed core and not the integration part, please file the Issue
+  directly with `Deepspeed <https://github.com/microsoft/DeepSpeed/>`__. If you aren't sure, please do not worry,
+  either Issue tracker will do, we will figure it out once you posted it and redirect you to another Issue tracker if
+  need be.
+
+
+
 Troubleshooting
 =======================================================================================================================

@@ -1606,7 +1728,7 @@ For example for a pretrained model:
 .. code-block:: python

    from transformers.deepspeed import HfDeepSpeedConfig
-    from transformers import AugoModel
+    from transformers import AutoModel, deepspeed

    ds_config = { ... } # deepspeed config object or path to the file
    # must run before instantiating the model
@@ -1619,7 +1741,7 @@ or for non-pretrained model:
 .. code-block:: python

    from transformers.deepspeed import HfDeepSpeedConfig
-    from transformers import AugoModel, AutoConfig
+    from transformers import AutoModel, AutoConfig, deepspeed

    ds_config = { ... } # deepspeed config object or path to the file
    # must run before instantiating the model
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,4 +1,4 @@
-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -35,9 +35,41 @@ PreTrainedModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedModel
+    :special-members: push_to_hub
    :members:


+.. _from_pretrained-torch-dtype:
+
+Model Instantiation dtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Under Pytorch a model normally gets instantiated with ``torch.float32`` format. This can be an issue if one tries to
+load a model whose weights are in fp16, since it'd require twice as much memory. To overcome this limitation, you can
+either explicitly pass the desired ``dtype`` using ``torch_dtype`` argument:
+
+.. code-block:: python
+
+    model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype=torch.float16)
+
+or, if you want the model to always load in the most optimal memory pattern, you can use the special value ``"auto"``,
+and then ``dtype`` will be automatically derived from the model's weights:
+
+.. code-block:: python
+
+    model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype="auto")
+
+Models instantiated from scratch can also be told which ``dtype`` to use with:
+
+.. code-block:: python
+
+    config = T5Config.from_pretrained("t5")
+    model = AutoModel.from_config(config, torch_dtype=torch.float16)
+
+Due to Pytorch design, this functionality is only available for floating dtypes.
+
+
+
 ModuleUtilsMixin
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -49,6 +81,7 @@ TFPreTrainedModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFPreTrainedModel
+    :special-members: push_to_hub
    :members:


@@ -63,6 +96,7 @@ FlaxPreTrainedModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.FlaxPreTrainedModel
+    :special-members: push_to_hub
    :members:


--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -299,3 +299,93 @@ TFSeq2SeqQuestionAnsweringModelOutput

 .. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
    :members:
+
+
+FlaxBaseModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutput
+
+
+FlaxBaseModelOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPast
+
+
+FlaxBaseModelOutputWithPooling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPooling
+
+
+FlaxBaseModelOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
+
+
+FlaxSeq2SeqModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqModelOutput
+
+
+FlaxCausalLMOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
+
+
+FlaxMaskedLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxMaskedLMOutput
+
+
+FlaxSeq2SeqLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqLMOutput
+
+
+FlaxNextSentencePredictorOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxNextSentencePredictorOutput
+
+
+FlaxSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSequenceClassifierOutput
+
+
+FlaxSeq2SeqSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
+
+
+FlaxMultipleChoiceModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxMultipleChoiceModelOutput
+
+
+FlaxTokenClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxTokenClassifierOutput
+
+
+FlaxQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
+
+
+FlaxSeq2SeqQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -23,20 +23,22 @@ There are two categories of pipeline abstractions to be aware about:
 - The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
 - The other task-specific pipelines:

+    - :class:`~transformers.AudioClassificationPipeline`
    - :class:`~transformers.AutomaticSpeechRecognitionPipeline`
    - :class:`~transformers.ConversationalPipeline`
    - :class:`~transformers.FeatureExtractionPipeline`
    - :class:`~transformers.FillMaskPipeline`
    - :class:`~transformers.ImageClassificationPipeline`
+    - :class:`~transformers.ObjectDetectionPipeline`
    - :class:`~transformers.QuestionAnsweringPipeline`
    - :class:`~transformers.SummarizationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`
    - :class:`~transformers.TextClassificationPipeline`
    - :class:`~transformers.TextGenerationPipeline`
+    - :class:`~transformers.Text2TextGenerationPipeline`
    - :class:`~transformers.TokenClassificationPipeline`
    - :class:`~transformers.TranslationPipeline`
    - :class:`~transformers.ZeroShotClassificationPipeline`
-    - :class:`~transformers.Text2TextGenerationPipeline`
-    - :class:`~transformers.TableQuestionAnsweringPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -44,12 +46,60 @@ The pipeline abstraction
 The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
 pipeline but requires an additional argument which is the `task`.

+Simple call on one item:
+
+.. code-block::
+
+    >>> pipe = pipeline("text-classification")
+    >>> pipe("This restaurant is awesome")
+    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+
+To call a pipeline on many items, you can call it with a `list`.
+
+.. code-block::
+
+    >>> pipe = pipeline("text-classification")
+    >>> pipe(["This restaurant is awesome", "This restaurant is aweful"])
+    [{'label': 'POSITIVE', 'score': 0.9998743534088135},
+     {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
+
+
+To iterate over full datasets it is recommended to use a :obj:`dataset` directly. This means you don't need to allocate
+the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom loops on
+GPU. If it doesn't, don't hesitate to create an issue.
+
+.. code-block::
+
+    pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
+    dataset = datasets.load_dataset("superb", name="asr", split="test")
+
+    # KeyDataset (only `pt`) will simply return the item in the dict returned by the dataset item
+    # as we're not interested in the `target` part of the dataset.
+    for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
+        print(out)
+        # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+        # {"text": ....}
+        # ....
+
+
 .. autofunction:: transformers.pipeline

+Implementing a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:doc:`Implementing a new pipeline <../add_new_pipeline>`

 The task specific pipelines
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+
+AudioClassificationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.AudioClassificationPipeline
+    :special-members: __call__
+    :members:
+
 AutomaticSpeechRecognitionPipeline
 =======================================================================================================================

@@ -94,6 +144,13 @@ NerPipeline

 See :class:`~transformers.TokenClassificationPipeline` for all details.

+ObjectDetectionPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.ObjectDetectionPipeline
+    :special-members: __call__
+    :members:
+
 QuestionAnsweringPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -53,10 +53,8 @@ PreTrainedTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizer
-    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add, prepare_for_tokenization, tokenize,
-        vocab_size
+    :special-members: __call__, batch_decode, decode, encode, push_to_hub
+    :members: 


 PreTrainedTokenizerFast
@@ -68,10 +66,8 @@ loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokeni
 <../fast_tokenizers>` page to understand how this is done.

 .. autoclass:: transformers.PreTrainedTokenizerFast
-    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add,
-        set_truncation_and_padding,tokenize, vocab_size
+    :special-members: __call__, batch_decode, decode, encode, push_to_hub
+    :members:


 BatchEncoding
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -64,9 +64,9 @@ classification:

    class MultilabelTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
-            labels = inputs.pop("labels")
+            labels = inputs.get("labels")
            outputs = model(**inputs)
-            logits = outputs.logits
+            logits = outputs.get('logits')
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.float().view(-1, self.model.config.num_labels))
@@ -119,6 +119,97 @@ TFTrainingArguments
    :members:


+Checkpoints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, :class:`~transformers.Trainer` will save all checkpoints in the :obj:`output_dir` you set in the
+:class:`~transformers.TrainingArguments` you are using. Those will go in a subfolder named :obj:`checkpoint-xxx` with
+xxx being the step at which the training was.
+
+Resuming training from a checkpoint can be done when calling :meth:`~transformers.Trainer.train` with either:
+
+- :obj:`resume_from_checkpoint=True` which will resume training from the latest checkpoint
+- :obj:`resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory
+  passed.
+
+In addition, you can easily save your checkpoints on the Model Hub when using :obj:`push_to_hub=True`. By default, all
+the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt
+the :obj:`hub_strategy` value of your :class:`~transformers.TrainingArguments` to either:
+
+- :obj:`"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to
+  resume training easily with :obj:`trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`.
+- :obj:`"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one
+  checkpoint folder per folder in your final repository)
+
+
+Logging
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default :class:`~transformers.Trainer` will use ``logging.INFO`` for the main process and ``logging.WARNING`` for
+the replicas if any.
+
+These defaults can be overridden to use any of the 5 ``logging`` levels with :class:`~transformers.TrainingArguments`'s
+arguments:
+
+- ``log_level`` - for the main process
+- ``log_level_replica`` - for the replicas
+
+Further, if :class:`~transformers.TrainingArguments`'s ``log_on_each_node`` is set to ``False`` only the main node will
+use the log level settings for its main process, all other nodes will use the log level settings for replicas.
+
+Note that :class:`~transformers.Trainer` is going to set ``transformers``'s log level separately for each node in its
+:meth:`~transformers.Trainer.__init__`. So you may want to set this sooner (see the next example) if you tap into other
+``transformers`` functionality before creating the :class:`~transformers.Trainer` object.
+
+Here is an example of how this can be used in an application:
+
+.. code-block:: python
+
+    [...]
+    logger = logging.getLogger(__name__)
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+
+    # set the main code and the modules it uses to the same log-level according to the node
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+
+    trainer = Trainer(...)
+
+And then if you only want to see warnings on the main node and all other nodes to not print any most likely duplicated
+warnings you could run it as:
+
+.. code-block:: bash
+
+    my_app.py ... --log_level warning --log_level_replica error
+
+In the multi-node environment if you also don't want the logs to repeat for each node's main process, you will want to
+change the above to:
+
+.. code-block:: bash
+
+    my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0
+
+and then only the main process of the first node will log at the "warning" level, and all other processes on the main
+node and all processes on other nodes will log at the "error" level.
+
+If you need your application to be as quiet as possible you could do:
+
+.. code-block:: bash
+
+    my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
+
+(add ``--log_on_each_node 0`` if on multi-node environment)
+
+
+
 Randomness
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -129,7 +220,7 @@ which should make the "stop and resume" style of training as close as possible t
 However, due to various default non-deterministic pytorch settings this might not fully work. If you want full
 determinism please refer to `Controlling sources of randomness
 <https://pytorch.org/docs/stable/notes/randomness.html>`__. As explained in the document, that some of those settings
-that make things determinstic (.e.g., ``torch.backends.cudnn.deterministic``) may slow things down, therefore this
+that make things deterministic (e.g., ``torch.backends.cudnn.deterministic``) may slow things down, therefore this
 can't be done by default, but you can enable those yourself if needed.


--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -43,7 +43,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

-This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The original code can be found `here
+This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The JAX version of this model was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/google-research/ALBERT>`__.

 AlbertConfig
@@ -174,3 +175,52 @@ TFAlbertForQuestionAnswering

 .. autoclass:: transformers.TFAlbertForQuestionAnswering
    :members: call
+
+
+FlaxAlbertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertModel
+    :members: __call__
+
+
+FlaxAlbertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForPreTraining
+    :members: __call__
+
+
+FlaxAlbertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForMaskedLM
+    :members: __call__
+
+
+FlaxAlbertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForSequenceClassification
+    :members: __call__
+
+
+FlaxAlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForMultipleChoice
+    :members: __call__
+
+
+FlaxAlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForTokenClassification
+    :members: __call__
+
+
+FlaxAlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -135,6 +135,34 @@ AutoModelForImageClassification
    :members:


+AutoModelForAudioClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForAudioClassification
+    :members:
+
+
+AutoModelForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForCTC
+    :members:
+
+
+AutoModelForSpeechSeq2Seq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSpeechSeq2Seq
+    :members:
+
+
+AutoModelForObjectDetection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForObjectDetection
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -226,6 +254,13 @@ FlaxAutoModelForMaskedLM
    :members:


+FlaxAutoModelForSeq2SeqLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForSeq2SeqLM
+    :members:
+
+
 FlaxAutoModelForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -259,3 +294,10 @@ FlaxAutoModelForNextSentencePrediction

 .. autoclass:: transformers.FlaxAutoModelForNextSentencePrediction
    :members:
+
+
+FlaxAutoModelForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModelForImageClassification
+    :members:
--- a/docs/source/model_doc/beit.rst
+++ b/docs/source/model_doc/beit.rst
@@ -0,0 +1,119 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BEiT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BEiT model was proposed in `BEiT: BERT Pre-Training of Image Transformers <https://arxiv.org/abs/2106.08254>`__ by
+Hangbo Bao, Li Dong and Furu Wei. Inspired by BERT, BEiT is the first paper that makes self-supervised pre-training of
+Vision Transformers (ViTs) outperform supervised pre-training. Rather than pre-training the model to predict the class
+of an image (as done in the `original ViT paper <https://arxiv.org/abs/2010.11929>`__), BEiT models are pre-trained to
+predict visual tokens from the codebook of OpenAI's `DALL-E model <https://arxiv.org/abs/2102.12092>`__ given masked
+patches.
+
+The abstract from the paper is the following:
+
+*We introduce a self-supervised vision representation model BEiT, which stands for Bidirectional Encoder representation
+from Image Transformers. Following BERT developed in the natural language processing area, we propose a masked image
+modeling task to pretrain vision Transformers. Specifically, each image has two views in our pre-training, i.e, image
+patches (such as 16x16 pixels), and visual tokens (i.e., discrete tokens). We first "tokenize" the original image into
+visual tokens. Then we randomly mask some image patches and fed them into the backbone Transformer. The pre-training
+objective is to recover the original visual tokens based on the corrupted image patches. After pre-training BEiT, we
+directly fine-tune the model parameters on downstream tasks by appending task layers upon the pretrained encoder.
+Experimental results on image classification and semantic segmentation show that our model achieves competitive results
+with previous pre-training methods. For example, base-size BEiT achieves 83.2% top-1 accuracy on ImageNet-1K,
+significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains
+86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%).*
+
+Tips:
+
+- BEiT models are regular Vision Transformers, but pre-trained in a self-supervised way rather than supervised. They
+  outperform both the original model (ViT) as well as Data-efficient Image Transformers (DeiT) when fine-tuned on
+  ImageNet-1K and CIFAR-100.
+- As the BEiT models expect each image to be of the same size (resolution), one can use
+  :class:`~transformers.BeitFeatureExtractor` to resize (or rescale) and normalize images for the model.
+- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
+  each checkpoint. For example, :obj:`microsoft/beit-base-patch16-224` refers to a base-sized architecture with patch
+  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
+  <https://huggingface.co/models?search=microsoft/beit>`__.
+- The available checkpoints are either (1) pre-trained on `ImageNet-22k <http://www.image-net.org/>`__ (a collection of
+  14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on `ImageNet-1k
+  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
+  images and 1,000 classes).
+- BEiT uses relative position embeddings, inspired by the T5 model. During pre-training, the authors shared the
+  relative position bias among the several self-attention layers. During fine-tuning, each layer's relative position
+  bias is initialized with the shared relative position bias obtained after pre-training. Note that, if one wants to
+  pre-train a model from scratch, one needs to either set the :obj:`use_relative_position_bias` or the
+  :obj:`use_shared_relative_position_bias` attribute of :class:`~transformers.BeitConfig` to :obj:`True` in order to
+  add position embeddings.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The JAX/FLAX version of this model was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
+<https://github.com/microsoft/unilm/tree/master/beit>`__.
+
+BeitConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitConfig
+    :members:
+
+
+BeitFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitFeatureExtractor
+    :members: __call__
+
+
+BeitModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitModel
+    :members: forward
+
+
+BeitForMaskedImageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitForMaskedImageModeling
+    :members: forward
+
+
+BeitForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitForImageClassification
+    :members: forward
+
+
+FlaxBeitModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitModel
+    :members: __call__
+
+
+FlaxBeitForMaskedImageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitForMaskedImageModeling
+    :members: __call__
+
+
+FlaxBeitForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitForImageClassification
+    :members: __call__
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -76,6 +76,9 @@ Bert specific outputs
 .. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
    :members:

+.. autoclass:: transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+    :members:
+

 BertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/blenderbot_small.rst
+++ b/docs/source/model_doc/blenderbot_small.rst
@@ -57,6 +57,13 @@ BlenderbotSmallTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+BlenderbotSmallTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallTokenizerFast
+    :members:
+
+
 BlenderbotSmallModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/byt5.rst
+++ b/docs/source/model_doc/byt5.rst
@@ -39,8 +39,11 @@ experiments.*
 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
 found `here <https://github.com/google-research/byt5>`__.

+ByT5's architecture is based on the T5v1.1 model, so one can refer to :doc:`T5v1.1's documentation page <t5v1.1>`. They
+only differ in how inputs should be prepared for the model, see the code examples below.

-ByT5's architecture is based on the T5 model, so one can refer to :doc:`T5's documentation page <t5>`.
+Since ByT5 was pre-trained in an unsupervised fashion, there's no real advantage to using a task prefix during
+single-task fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.


 Example
--- a/docs/source/model_doc/canine.rst
+++ b/docs/source/model_doc/canine.rst
@@ -0,0 +1,155 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+CANINE
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CANINE model was proposed in `CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language
+Representation <https://arxiv.org/abs/2103.06874>`__ by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. It's
+among the first papers that train a Transformer without using an explicit tokenization step (such as Byte Pair
+Encoding (BPE), WordPiece or SentencePiece). Instead, the model is trained directly at a Unicode character-level.
+Training at a character-level inevitably comes with a longer sequence length, which CANINE solves with an efficient
+downsampling strategy, before applying a deep Transformer encoder.
+
+The abstract from the paper is the following:
+
+*Pipelined NLP systems have largely been superseded by end-to-end neural modeling, yet nearly all commonly-used models
+still require an explicit tokenization step. While recent tokenization approaches based on data-derived subword
+lexicons are less brittle than manually engineered tokenizers, these techniques are not equally suited to all
+languages, and the use of any fixed vocabulary may limit a model's ability to adapt. In this paper, we present CANINE,
+a neural encoder that operates directly on character sequences, without explicit tokenization or vocabulary, and a
+pre-training strategy that operates either directly on characters or optionally uses subwords as a soft inductive bias.
+To use its finer-grained input effectively and efficiently, CANINE combines downsampling, which reduces the input
+sequence length, with a deep transformer stack, which encodes context. CANINE outperforms a comparable mBERT model by
+2.8 F1 on TyDi QA, a challenging multilingual benchmark, despite having 28% fewer model parameters.*
+
+Tips:
+
+- CANINE uses no less than 3 Transformer encoders internally: 2 "shallow" encoders (which only consist of a single
+  layer) and 1 "deep" encoder (which is a regular BERT encoder). First, a "shallow" encoder is used to contextualize
+  the character embeddings, using local attention. Next, after downsampling, a "deep" encoder is applied. Finally,
+  after upsampling, a "shallow" encoder is used to create the final character embeddings. Details regarding up- and
+  downsampling can be found in the paper.
+- CANINE uses a max sequence length of 2048 characters by default. One can use :class:`~transformers.CanineTokenizer`
+  to prepare text for the model.
+- Classification can be done by placing a linear layer on top of the final hidden state of the special [CLS] token
+  (which has a predefined Unicode code point). For token classification tasks however, the downsampled sequence of
+  tokens needs to be upsampled again to match the length of the original character sequence (which is 2048). The
+  details for this can be found in the paper.
+-  Models:
+
+      - `google/canine-c <https://huggingface.co/google/canine-c>`__: Pre-trained with autoregressive character loss,
+        12-layer, 768-hidden, 12-heads, 121M parameters (size ~500 MB).
+      - `google/canine-s <https://huggingface.co/google/canine-s>`__: Pre-trained with subword loss, 12-layer,
+        768-hidden, 12-heads, 121M parameters (size ~500 MB).
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/google-research/language/tree/master/language/canine>`__.
+
+
+Example
+_______________________________________________________________________________________________________________________
+
+CANINE works on raw characters, so it can be used without a tokenizer:
+
+.. code-block::
+
+    from transformers import CanineModel
+    import torch
+
+    model = CanineModel.from_pretrained('google/canine-c') # model pre-trained with autoregressive character loss
+
+    text = "hello world"
+    # use Python's built-in ord() function to turn each character into its unicode code point id
+    input_ids = torch.tensor([[ord(char) for char in text]])
+
+    outputs = model(input_ids) # forward pass
+    pooled_output = outputs.pooler_output
+    sequence_output = outputs.last_hidden_state
+
+
+For batched inference and training, it is however recommended to make use of the tokenizer (to pad/truncate all
+sequences to the same length):
+
+.. code-block::
+
+    from transformers import CanineTokenizer, CanineModel
+
+    model = CanineModel.from_pretrained('google/canine-c')
+    tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
+
+    inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
+    encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")
+
+    outputs = model(**encoding) # forward pass
+    pooled_output = outputs.pooler_output
+    sequence_output = outputs.last_hidden_state
+
+
+CANINE specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.canine.modeling_canine.CanineModelOutputWithPooling
+    :members:
+
+
+CanineConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineConfig
+    :members:
+
+
+CanineTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences
+
+
+CanineModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineModel
+    :members: forward
+
+
+CanineForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForSequenceClassification
+    :members: forward
+
+
+CanineForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForMultipleChoice
+    :members: forward
+
+
+CanineForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForTokenClassification
+    :members: forward
+
+
+CanineForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/clip.rst
+++ b/docs/source/model_doc/clip.rst
@@ -60,7 +60,6 @@ encode the text and prepare the images. The following example shows how to get t

 .. code-block::

-        >>> import torch
        >>> from PIL import Image
        >>> import requests

--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -38,7 +38,8 @@ the training data performs consistently better on a wide range of NLP tasks, ach
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*


-This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The TF 2.0 implementation was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/microsoft/DeBERTa>`__.


@@ -103,3 +104,45 @@ DebertaForQuestionAnswering

 .. autoclass:: transformers.DebertaForQuestionAnswering
    :members: forward
+
+
+TFDebertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaModel
+    :members: call
+
+
+TFDebertaPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaPreTrainedModel
+    :members: call
+
+
+TFDebertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForMaskedLM
+    :members: call
+
+
+TFDebertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForSequenceClassification
+    :members: call
+
+
+TFDebertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForTokenClassification
+    :members: call
+
+
+TFDebertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/deberta_v2.rst
+++ b/docs/source/model_doc/deberta_v2.rst
@@ -53,12 +53,13 @@ New in v2:
  transformer layer to better learn the local dependency of input tokens.
 - **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
  experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative postions** The DeBERTa-v2 model uses log bucket to encode relative positions
+- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
  similar to T5.
 - **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
  performance of downstream tasks.

-This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The TF 2.0 implementation was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/microsoft/DeBERTa>`__.


@@ -117,3 +118,45 @@ DebertaV2ForQuestionAnswering

 .. autoclass:: transformers.DebertaV2ForQuestionAnswering
    :members: forward
+
+
+TFDebertaV2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2Model
+    :members: call
+
+
+TFDebertaV2PreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2PreTrainedModel
+    :members: call
+
+
+TFDebertaV2ForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForMaskedLM
+    :members: call
+
+
+TFDebertaV2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForSequenceClassification
+    :members: call
+
+
+TFDebertaV2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForTokenClassification
+    :members: call
+
+
+TFDebertaV2ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -44,8 +44,9 @@ Tips:
 - DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
  necessary though, just let us know if you need this option.

-This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. The original code can be found
-:prefix_link:`here <examples/research-projects/distillation>`.
+This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. The JAX version was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found :prefix_link:`here
+<examples/research_projects/distillation>`.


 DistilBertConfig
@@ -152,3 +153,45 @@ TFDistilBertForQuestionAnswering

 .. autoclass:: transformers.TFDistilBertForQuestionAnswering
    :members: call
+
+
+FlaxDistilBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertModel
+    :members: __call__
+
+
+FlaxDistilBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForMaskedLM
+    :members: __call__
+
+
+FlaxDistilBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForSequenceClassification
+    :members: __call__
+
+
+FlaxDistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForMultipleChoice
+    :members: __call__
+
+
+FlaxDistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForTokenClassification
+    :members: __call__
+
+
+FlaxDistilBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -40,3 +40,10 @@ EncoderDecoderModel

 .. autoclass:: transformers.EncoderDecoderModel
    :members: forward, from_encoder_decoder_pretrained
+
+
+FlaxEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxEncoderDecoderModel
+    :members: __call__, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/fnet.rst
+++ b/docs/source/model_doc/fnet.rst
@@ -0,0 +1,121 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+FNet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The FNet model was proposed in `FNet: Mixing Tokens with Fourier Transforms <https://arxiv.org/abs/2105.03824>`__ by
+James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. The model replaces the self-attention layer in a BERT
+model with a Fourier transform which returns only the real parts of the transform. The model is significantly faster
+than the BERT model because it has fewer parameters and is more memory efficient. The model achieves about 92-97% of
+the accuracy of BERT counterparts on the GLUE benchmark, and trains much faster than the BERT model. The abstract
+from the paper is the following:
+
+*We show that Transformer encoder architectures can be sped up, with limited accuracy costs, by replacing the
+self-attention sublayers with simple linear transformations that "mix" input tokens. These linear mixers, along with
+standard nonlinearities in feed-forward layers, prove competent at modeling semantic relationships in several text
+classification tasks. Most surprisingly, we find that replacing the self-attention sublayer in a Transformer encoder
+with a standard, unparameterized Fourier Transform achieves 92-97% of the accuracy of BERT counterparts on the GLUE
+benchmark, but trains 80% faster on GPUs and 70% faster on TPUs at standard 512 input lengths. At longer input lengths,
+our FNet model is significantly faster: when compared to the "efficient" Transformers on the Long Range Arena
+benchmark, FNet matches the accuracy of the most accurate models, while outpacing the fastest models across all
+sequence lengths on GPUs (and across relatively shorter lengths on TPUs). Finally, FNet has a light memory footprint
+and is particularly efficient at smaller model sizes; for a fixed speed and accuracy budget, small FNet models
+outperform Transformer counterparts.*
+
+Tips on usage:
+
+- The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with
+  maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum
+  sequence length for fine-tuning and inference.
+
+This model was contributed by `gchhablani <https://huggingface.co/gchhablani>`__. The original code can be found `here
+<https://github.com/google-research/google-research/tree/master/f_net>`__.
+
+FNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetConfig
+    :members:
+
+
+FNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+FNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetTokenizerFast
+    :members:
+
+
+FNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetModel
+    :members: forward
+
+
+FNetForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForPreTraining
+    :members: forward
+
+
+FNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForMaskedLM
+    :members: forward
+
+
+FNetForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForNextSentencePrediction
+    :members: forward
+
+FNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForSequenceClassification
+    :members: forward
+
+
+FNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForMultipleChoice
+    :members: forward
+
+
+FNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForTokenClassification
+    :members: forward
+
+
+FNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -36,10 +36,11 @@ Tips:
 - GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be
  observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
-  `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
-  this argument.
+- The model can take the `past_key_values` (for PyTorch) or `past` (for TF) as input, which is the previously computed
+  key/value attention pairs. Using this (`past_key_values` or `past`) value prevents the model from re-computing
+  pre-computed values in the context of text generation. For PyTorch, see the `past_key_values` argument of the
+  :meth:`~transformers.GPT2Model.forward` method, or for TF the `past` argument of the
+  :meth:`~transformers.TFGPT2Model.call` method for more information on its usage.

 `Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
@@ -108,6 +109,13 @@ GPT2ForSequenceClassification
    :members: forward


+GPT2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2ForTokenClassification
+    :members: forward
+
+
 TFGPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/gpt_neo.rst
+++ b/docs/source/model_doc/gpt_neo.rst
@@ -71,3 +71,16 @@ GPTNeoForSequenceClassification

 .. autoclass:: transformers.GPTNeoForSequenceClassification
    :members: forward
+
+FlaxGPTNeoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxGPTNeoModel
+    :members: __call__
+
+
+FlaxGPTNeoForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxGPTNeoForCausalLM
+    :members: __call__
--- a/docs/source/model_doc/gptj.rst
+++ b/docs/source/model_doc/gptj.rst
@@ -0,0 +1,107 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+GPT-J
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The GPT-J model was released in the `kingoflolz/mesh-transformer-jax
+<https://github.com/kingoflolz/mesh-transformer-jax>`__ repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like
+causal language model trained on `the Pile <https://pile.eleuther.ai/>`__ dataset.
+
+This model was contributed by `Stella Biderman <https://huggingface.co/stellaathena>`__.
+
+Tips:
+
+- Running `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ in float32 precision on GPU requires at least 24 GB
+  of RAM. On GPUs with less than 24 GB RAM, one should therefore load the model in half-precision:
+
+.. code-block::
+
+    >>> from transformers import GPTJForCausalLM
+    >>> import torch
+
+    >>> model =  GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
+
+- Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
+  tokens are added for the sake of efficiency on TPUs. To avoid the mismatch between embedding matrix size and vocab
+  size, the tokenizer for `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ contains 143 extra tokens
+  ``<|extratoken_1|>... <|extratoken_143|>``, so the ``vocab_size`` of the tokenizer also becomes 50400.
+
+Generation
+_______________________________________________________________________________________________________________________
+
+The :meth:`~transformers.generation_utils.GenerationMixin.generate` method can be used to generate text using the
+GPT-J model.
+
+.. code-block::
+
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+    >>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+...or in float16 precision:
+
+.. code-block::
+
+    >>> from transformers import GPTJForCausalLM, AutoTokenizer
+    >>> import torch
+
+    >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
+    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+
+GPTJConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJConfig
+    :members:
+
+GPTJModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJModel
+    :members: forward
+
+
+GPTJForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJForCausalLM
+    :members: forward
+
+
+GPTJForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJForSequenceClassification
+    :members: forward
--- a/docs/source/model_doc/hubert.rst
+++ b/docs/source/model_doc/hubert.rst
@@ -63,3 +63,24 @@ HubertForCTC

 .. autoclass:: transformers.HubertForCTC
    :members: forward
+
+
+HubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HubertForSequenceClassification
+    :members: forward
+
+
+TFHubertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFHubertModel
+    :members: call
+
+
+TFHubertForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFHubertForCTC
+    :members: call
--- a/docs/source/model_doc/layoutlmv2.rst
+++ b/docs/source/model_doc/layoutlmv2.rst
@@ -0,0 +1,314 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LayoutLMv2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LayoutLMv2 model was proposed in `LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding
+<https://arxiv.org/abs/2012.14740>`__ by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu,
+Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. LayoutLMv2 improves `LayoutLM
+<https://huggingface.co/transformers/model_doc/layoutlm.html>`__ to obtain state-of-the-art results across several
+document image understanding benchmarks:
+
+- information extraction from scanned documents: the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__ dataset (a
+  collection of 199 annotated forms comprising more than 30,000 words), the `CORD <https://github.com/clovaai/cord>`__
+  dataset (a collection of 800 receipts for training, 100 for validation and 100 for testing), the `SROIE
+  <https://rrc.cvc.uab.es/?ch=13>`__ dataset (a collection of 626 receipts for training and 347 receipts for testing)
+  and the `Kleister-NDA <https://github.com/applicaai/kleister-nda>`__ dataset (a collection of non-disclosure
+  agreements from the EDGAR database, including 254 documents for training, 83 documents for validation, and 203
+  documents for testing).
+- document image classification: the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset (a collection of
+  400,000 images belonging to one of 16 classes).
+- document visual question answering: the `DocVQA <https://arxiv.org/abs/2007.00398>`__ dataset (a collection of 50,000
+  questions defined on 12,000+ document images).
+
+The abstract from the paper is the following:
+
+*Pre-training of text and layout has proved effective in a variety of visually-rich document understanding tasks due to
+its effective model architecture and the advantage of large-scale unlabeled scanned/digital-born documents. In this
+paper, we present LayoutLMv2 by pre-training text, layout and image in a multi-modal framework, where new model
+architectures and pre-training tasks are leveraged. Specifically, LayoutLMv2 not only uses the existing masked
+visual-language modeling task but also the new text-image alignment and text-image matching tasks in the pre-training
+stage, where cross-modality interaction is better learned. Meanwhile, it also integrates a spatial-aware self-attention
+mechanism into the Transformer architecture, so that the model can fully understand the relative positional
+relationship among different text blocks. Experiment results show that LayoutLMv2 outperforms strong baselines and
+achieves new state-of-the-art results on a wide variety of downstream visually-rich document understanding tasks,
+including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.9781), Kleister-NDA (0.834 -> 0.852),
+RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
+this https URL.*
+
+Tips:
+
+- The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during
+  pre-training (while LayoutLMv1 only adds visual embeddings during fine-tuning).
+- LayoutLMv2 adds both a relative 1D attention bias as well as a spatial 2D attention bias to the attention scores in
+  the self-attention layers. Details can be found on page 5 of the `paper <https://arxiv.org/abs/2012.14740>`__.
+- Demo notebooks on how to use the LayoutLMv2 model on RVL-CDIP, FUNSD, DocVQA, CORD can be found `here
+  <https://github.com/NielsRogge/Transformers-Tutorials>`__.
+- LayoutLMv2 uses Facebook AI's `Detectron2 <https://github.com/facebookresearch/detectron2/>`__ package for its visual
+  backbone. See `this link <https://detectron2.readthedocs.io/en/latest/tutorials/install.html>`__ for installation
+  instructions.
+- In addition to :obj:`input_ids`, :meth:`~transformers.LayoutLMv2Model.forward` expects 2 additional inputs, namely
+  :obj:`image` and :obj:`bbox`. The :obj:`image` input corresponds to the original document image in which the text
+  tokens occur. The model expects each document image to be of size 224x224. This means that if you have a batch of
+  document images, :obj:`image` should be a tensor of shape (batch_size, 3, 224, 224). This can be either a
+  :obj:`torch.Tensor` or a :obj:`detectron2.structures.ImageList`. You don't need to normalize the channels, as this is
+  done by the model. Important to note is that the visual backbone expects BGR channels instead of RGB, as all models
+  in Detectron2 are pre-trained using the BGR format. The :obj:`bbox` input are the bounding boxes (i.e. 2D-positions)
+  of the input text tokens. This is identical to :class:`~transformers.LayoutLMModel`. These can be obtained using an
+  external OCR engine such as Google's `Tesseract <https://github.com/tesseract-ocr/tesseract>`__ (there's a `Python
+  wrapper <https://pypi.org/project/pytesseract/>`__ available). Each bounding box should be in (x0, y0, x1, y1)
+  format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1)
+  represents the position of the lower right corner. Note that one first needs to normalize the bounding boxes to be on
+  a 0-1000 scale. To normalize, you can use the following function:
+
+.. code-block::
+
+    def normalize_bbox(bbox, width, height):
+         return [
+             int(1000 * (bbox[0] / width)),
+             int(1000 * (bbox[1] / height)),
+             int(1000 * (bbox[2] / width)),
+             int(1000 * (bbox[3] / height)),
+         ]
+
+Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
+occurs (before resizing the image). Those can be obtained using the Python Imaging Library (PIL), for example, as
+follows:
+
+.. code-block::
+
+    from PIL import Image
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+
+    width, height = image.size
+
+However, this model includes a brand new :class:`~transformers.LayoutLMv2Processor` which can be used to directly
+prepare data for the model (including applying OCR under the hood). More information can be found in the "Usage"
+section below.
+
+- Internally, :class:`~transformers.LayoutLMv2Model` will send the :obj:`image` input through its visual backbone to
+  obtain a lower-resolution feature map, whose shape is equal to the :obj:`image_feature_pool_shape` attribute of
+  :class:`~transformers.LayoutLMv2Config`. This feature map is then flattened to obtain a sequence of image tokens. As
+  the size of the feature map is 7x7 by default, one obtains 49 image tokens. These are then concatenated with the text
+  tokens, and sent through the Transformer encoder. This means that the last hidden states of the model will have a
+  length of 512 + 49 = 561, if you pad the text tokens up to the max length. More generally, the last hidden states
+  will have a length of :obj:`seq_length` + :obj:`config.image_feature_pool_shape[0]` *
+  :obj:`config.image_feature_pool_shape[1]`.
+- When calling :meth:`~transformers.LayoutLMv2Model.from_pretrained`, a warning will be printed with a long list of
+  parameter names that are not initialized. This is not a problem, as these parameters are batch normalization
+  statistics, which are going to have values when fine-tuning on a custom dataset.
+- If you want to train the model in a distributed environment, make sure to call :meth:`synchronize_batch_norm` on the
+  model in order to properly synchronize the batch normalization layers of the visual backbone.
+
+In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on
+:doc:`LayoutXLM's documentation page <layoutxlm>`.
+
+Usage: LayoutLMv2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The easiest way to prepare data for the model is to use :class:`~transformers.LayoutLMv2Processor`, which internally
+combines a feature extractor (:class:`~transformers.LayoutLMv2FeatureExtractor`) and a tokenizer
+(:class:`~transformers.LayoutLMv2Tokenizer` or :class:`~transformers.LayoutLMv2TokenizerFast`). The feature extractor
+handles the image modality, while the tokenizer handles the text modality. A processor combines both, which is ideal
+for a multi-modal model like LayoutLMv2. Note that you can still use both separately, if you only want to handle one
+modality.
+
+.. code-block::
+
+    from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor
+
+    feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
+    tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    processor = LayoutLMv2Processor(feature_extractor, tokenizer)
+
+In short, one can provide a document image (and possibly additional data) to
+:class:`~transformers.LayoutLMv2Processor`, and it will create the inputs expected by the model. Internally, the
+processor first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to apply OCR on the image to get a list of
+words and normalized bounding boxes, as well as to resize the image to a given size in order to get the :obj:`image`
+input. The words and normalized bounding boxes are then provided to :class:`~transformers.LayoutLMv2Tokenizer` or
+:class:`~transformers.LayoutLMv2TokenizerFast`, which converts them to token-level :obj:`input_ids`,
+:obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`. Optionally, one can provide word labels to the processor,
+which are turned into token-level :obj:`labels`.
+
+:class:`~transformers.LayoutLMv2Processor` uses `PyTesseract <https://pypi.org/project/pytesseract/>`__, a Python
+wrapper around Google's Tesseract OCR engine, under the hood. Note that you can still use your own OCR engine of
+choice, and provide the words and normalized boxes yourself. This requires initializing
+:class:`~transformers.LayoutLMv2FeatureExtractor` with :obj:`apply_ocr` set to :obj:`False`.
+
+In total, there are 5 use cases that are supported by the processor. Below, we list them all. Note that each of these
+use cases works for both batched and non-batched inputs (we illustrate them for non-batched inputs).
+
+**Use case 1: document image classification (training, inference) + token classification (inference),
+apply_ocr=True**
+
+This is the simplest case, in which the processor (actually the feature extractor) will perform OCR on the image to get
+the words and normalized bounding boxes.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False**
+
+In case one wants to do OCR themselves, one can initialize the feature extractor with :obj:`apply_ocr` set to
+:obj:`False`. In that case, one should provide the words and corresponding (normalized) bounding boxes themselves to
+the processor.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    encoding = processor(image, words, boxes=boxes, return_tensors="pt")
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 3: token classification (training), apply_ocr=False**
+
+For token classification tasks (such as FUNSD, CORD, SROIE, Kleister-NDA), one can also provide the corresponding word
+labels in order to train a model. The processor will then convert these into token-level :obj:`labels`. By default, it
+will only label the first wordpiece of a word, and label the remaining wordpieces with -100, which is the
+:obj:`ignore_index` of PyTorch's CrossEntropyLoss. In case you want all wordpieces of a word to be labeled, you can
+initialize the tokenizer with :obj:`only_label_first_subword` set to :obj:`False`.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    word_labels = [1, 2]
+    encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels', 'image'])
+
+**Use case 4: visual question answering (inference), apply_ocr=True**
+
+For visual question answering tasks (such as DocVQA), you can provide a question to the processor. By default, the
+processor will apply OCR on the image, and create [CLS] question tokens [SEP] word tokens [SEP].
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    question = "What's his name?"
+    encoding = processor(image, question, return_tensors="pt") 
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 5: visual question answering (inference), apply_ocr=False**
+
+For visual question answering tasks (such as DocVQA), you can provide a question to the processor. If you want to
+perform OCR yourself, you can provide your own words and (normalized) bounding boxes to the processor.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    question = "What's his name?"
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")  
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+LayoutLMv2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Config
+    :members:
+
+
+LayoutLMv2FeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2FeatureExtractor
+    :members: __call__
+
+
+LayoutLMv2Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Tokenizer
+    :members: __call__, save_vocabulary
+
+
+LayoutLMv2TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2TokenizerFast
+    :members: __call__
+
+
+LayoutLMv2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Processor
+    :members: __call__
+
+
+LayoutLMv2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Model
+    :members: forward
+
+
+LayoutLMv2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForSequenceClassification
+    :members:
+
+
+LayoutLMv2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForTokenClassification
+    :members:
+
+
+LayoutLMv2ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/layoutxlm.rst
+++ b/docs/source/model_doc/layoutxlm.rst
@@ -0,0 +1,56 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LayoutXLM
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+LayoutXLM was proposed in `LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding
+<https://arxiv.org/abs/2104.08836>`__ by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha
+Zhang, Furu Wei. It's a multilingual extension of the `LayoutLMv2 model <https://arxiv.org/abs/2012.14740>`__ trained
+on 53 languages.
+
+The abstract from the paper is the following:
+
+*Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually-rich document
+understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. In
+this paper, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to
+bridge the language barriers for visually-rich document understanding. To accurately evaluate LayoutXLM, we also
+introduce a multilingual form understanding benchmark dataset named XFUN, which includes form understanding samples in
+7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese), and key-value pairs are manually labeled
+for each language. Experiment results show that the LayoutXLM model has significantly outperformed the existing SOTA
+cross-lingual pre-trained models on the XFUN dataset.*
+
+One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like so:
+
+.. code-block::
+
+    from transformers import LayoutLMv2Model
+
+    model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base') 
+
+Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
+initialize it as follows:
+
+.. code-block::
+
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base') 
+
+As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
+<layoutlmv2>` for all tips, code examples and notebooks.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/microsoft/unilm>`__.
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -46,8 +46,8 @@ Tips:
 - LED makes use of *global attention* by means of the ``global_attention_mask`` (see
  :class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
  ``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question.
- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by setting
-  ``config.gradient_checkpointing = True``.
+- To fine-tune LED on all 16384 tokens, it is necessary to enable *gradient checkpointing* by executing
+  ``model.gradient_checkpointing_enable()``.
 - A notebook showing how to evaluate LED, can be accessed `here
  <https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
 - A notebook showing how to fine-tune LED, can be accessed `here
--- a/docs/source/model_doc/m2m_100.rst
+++ b/docs/source/model_doc/m2m_100.rst
@@ -58,7 +58,7 @@ examples. To install :obj:`sentencepiece` run ``pip install sentencepiece``.
    tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")

    src_text = "Life is like a box of chocolates."
-    tgt_lang = "La vie est comme une boîte de chocolat."
+    tgt_text = "La vie est comme une boîte de chocolat."

    model_inputs = tokenizer(src_text, return_tensors="pt")
    with tokenizer.as_target_tokenizer():
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -103,8 +103,8 @@ Here is the code to see all available pretrained models on the hub:

 .. code-block:: python

-    from transformers.hf_api import HfApi
-    model_list = HfApi().model_list()
+    from huggingface_hub.hf_api import HfApi
+    model_list = HfApi().list_models()
    org = "Helsinki-NLP"
    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
    suffix = [x.split('/')[1] for x in model_ids]
@@ -216,3 +216,17 @@ TFMarianMTModel

 .. autoclass:: transformers.TFMarianMTModel
    :members: call
+
+
+FlaxMarianModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMarianModel
+    :members: __call__
+
+
+FlaxMarianMTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMarianMTModel
+    :members: __call__
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -49,11 +49,11 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz

    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer

-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"

-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
    >>> with tokenizer.as_target_tokenizer():
    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")

@@ -240,3 +240,31 @@ TFMBartForConditionalGeneration

 .. autoclass:: transformers.TFMBartForConditionalGeneration
    :members: call
+
+
+FlaxMBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartModel
+    :members: __call__, encode, decode
+
+
+FlaxMBartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartForConditionalGeneration
+    :members: __call__, encode, decode
+
+
+FlaxMBartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartForSequenceClassification
+    :members: __call__, encode, decode
+
+
+FlaxMBartForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartForQuestionAnswering
+    :members: __call__, encode, decode
--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@@ -10,7 +10,7 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-MT5
+mT5
 -----------------------------------------------------------------------------------------------------------------------

 Overview
@@ -24,9 +24,28 @@ The abstract from the paper is the following:

 *The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
 state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
-multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We describe
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
 the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
-benchmarks. All of the code and model checkpoints*
+benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
+generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
+checkpoints used in this work are publicly available.*
+
+Note: mT5 was only pre-trained on `mC4 <https://huggingface.co/datasets/mc4>`__ excluding any supervised training.
+Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model.
+Since mT5 was pre-trained without supervision, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- `google/mt5-small <https://huggingface.co/google/mt5-small>`__
+
+- `google/mt5-base <https://huggingface.co/google/mt5-base>`__
+
+- `google/mt5-large <https://huggingface.co/google/mt5-large>`__
+
+- `google/mt5-xl <https://huggingface.co/google/mt5-xl>`__
+
+- `google/mt5-xxl <https://huggingface.co/google/mt5-xxl>`__.

 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
 found `here <https://github.com/google-research/multilingual-t5>`__.
@@ -94,3 +113,17 @@ TFMT5EncoderModel

 .. autoclass:: transformers.TFMT5EncoderModel
    :members:
+
+
+FlaxMT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5Model
+    :members:
+
+
+FlaxMT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5ForConditionalGeneration
+    :members:
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -152,3 +152,17 @@ TFPegasusForConditionalGeneration

 .. autoclass:: transformers.TFPegasusForConditionalGeneration
    :members: call
+
+
+FlaxPegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPegasusModel
+    :members: __call__, encode, decode
+
+
+FlaxPegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPegasusForConditionalGeneration
+    :members: __call__, encode, decode
--- a/docs/source/model_doc/rembert.rst
+++ b/docs/source/model_doc/rembert.rst
@@ -0,0 +1,161 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+RemBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The RemBERT model was proposed in `Rethinking Embedding Coupling in Pre-trained Language Models
+<https://arxiv.org/abs/2010.12821>`__ by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder.
+
+The abstract from the paper is the following:
+
+*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art
+pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to
+significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By
+reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on
+standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that
+allocating additional capacity to the output embedding provides benefits to the model that persist through the
+fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger
+output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage
+Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these
+findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the
+number of parameters at the fine-tuning stage.*
+
+Tips:
+
+For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the
+embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input
+embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is
+also similar to the Albert one rather than the BERT one.
+
+RemBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertConfig
+    :members:
+
+
+RemBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RemBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertTokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RemBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertModel
+    :members: forward
+
+
+RemBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForCausalLM
+    :members: forward
+
+
+RemBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForMaskedLM
+    :members: forward
+
+
+RemBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForSequenceClassification
+    :members: forward
+
+
+RemBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForMultipleChoice
+    :members: forward
+
+
+RemBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForTokenClassification
+    :members: forward
+
+
+RemBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForQuestionAnswering
+    :members: forward
+
+
+TFRemBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertModel
+    :members: call
+
+
+TFRemBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForMaskedLM
+    :members: call
+
+
+TFRemBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForCausalLM
+    :members: call
+
+
+TFRemBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForSequenceClassification
+    :members: call
+
+
+TFRemBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForMultipleChoice
+    :members: call
+
+
+TFRemBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForTokenClassification
+    :members: call
+
+
+TFRemBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/roformer.rst
+++ b/docs/source/model_doc/roformer.rst
@@ -56,7 +56,7 @@ RoFormerTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


-RobertaTokenizerFast
+RoFormerTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RoFormerTokenizerFast
--- a/docs/source/model_doc/speech_to_text.rst
+++ b/docs/source/model_doc/speech_to_text.rst
@@ -42,8 +42,8 @@ features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transfo
 predicted token ids.

 The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
-install those packages before running the examples. You could either install those as extra speech dependancies with
-``pip install transformers"[speech, sentencepiece]"`` or install the packages seperatly with ``pip install torchaudio
+install those packages before running the examples. You could either install those as extra speech dependencies with
+``pip install transformers"[speech, sentencepiece]"`` or install the packages separately with ``pip install torchaudio
 sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
 <http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
 be installed as follows: ``apt install libsndfile1-dev``
--- a/docs/source/model_doc/speech_to_text_2.rst
+++ b/docs/source/model_doc/speech_to_text_2.rst
@@ -0,0 +1,123 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Speech2Text2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Speech2Text2 model is used together with :doc:`Wav2Vec2 <wav2vec2>` for Speech Translation models proposed in
+`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
+Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+
+Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only* model, such as
+:doc:`Wav2Vec2 <wav2vec2>` or :doc:`HuBERT <hubert>` for Speech-to-Text tasks. Please refer to the
+:doc:`SpeechEncoderDecoder <speechencoderdecoder>` class on how to combine Speech2Text2 with any speech *encoder-only*
+model.
+
+This model was contributed by `Patrick von Platen <https://huggingface.co/patrickvonplaten>`__.
+
+The original code can be found `here
+<https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266>`__.
+
+
+Tips:
+
+- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
+  the `official models <https://huggingface.co/models?other=speech2text2>`__ .
+- Speech2Text2 is always used within the :doc:`SpeechEncoderDecoder <speechencoderdecoder>` framework.
+- Speech2Text2's tokenizer currently only supports inference, but not training.
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Speech2Text2's :class:`~transformers.SpeechEncoderDecoderModel` model accepts raw waveform input values from speech and
+makes use of :func:`~transformers.generation_utils.GenerationMixin.generate` to translate the input speech
+autoregressively to the target language.
+
+The :class:`~transformers.Wav2Vec2FeatureExtractor` class is responsible for preprocessing the input speech and
+:class:`~transformers.Speech2Text2Tokenizer` decodes the generated target tokens to the target string. The
+:class:`~transformers.Speech2Text2Processor` wraps :class:`~transformers.Wav2Vec2FeatureExtractor` and
+:class:`~transformers.Speech2Text2Tokenizer` into a single instance to both extract the input features and decode the
+predicted token ids.
+
+- Step-by-step Speech Translation
+
+.. code-block::
+
+        >>> import torch
+        >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+        >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
+        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+        >>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"])
+
+        >>> transcription = processor.batch_decode(generated_ids)
+
+
+- Speech Translation via Pipelines
+
+    The automatic speech recognition pipeline can also be used to translate speech in just a couple of lines of code:
+
+.. code-block::
+
+        >>> from datasets import load_dataset
+        >>> from transformers import pipeline
+
+        >>> librispeech_en = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+
+        >>> translation_de = asr(librispeech_en[0]["file"])
+
+
+See `model hub <https://huggingface.co/models?filter=speech2text2>`__ to look for Speech2Text2 checkpoints.
+
+
+Speech2Text2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Config
+    :members:
+
+
+Speech2Text2Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Tokenizer
+    :members: batch_decode, decode, save_vocabulary
+
+
+Speech2Text2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Processor
+    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
+
+
+Speech2Text2ForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2ForCausalLM
+    :members: forward
--- a/docs/source/model_doc/speechencoderdecoder.rst
+++ b/docs/source/model_doc/speechencoderdecoder.rst
@@ -0,0 +1,40 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Speech Encoder Decoder Models
+-----------------------------------------------------------------------------------------------------------------------
+
+The :class:`~transformers.SpeechEncoderDecoderModel` can be used to initialize a speech-sequence-to-text-sequence model
+with any pretrained speech autoencoding model as the encoder (*e.g.* :doc:`Wav2Vec2 <wav2vec2>`, :doc:`Hubert
+<hubert>`) and any pretrained autoregressive model as the decoder.
+
+The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
+recognition and speech translation has *e.g.* been shown in `Large-Scale Self- and Semi-Supervised Learning for Speech
+Translation <https://arxiv.org/abs/2104.06678>`__ by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
+Alexis Conneau.
+
+An example of how to use a :class:`~transformers.SpeechEncoderDecoderModel` for inference can be seen in
+:doc:`Speech2Text2 <speech_to_text_2>`.
+
+
+SpeechEncoderDecoderConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpeechEncoderDecoderConfig
+    :members:
+
+
+SpeechEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpeechEncoderDecoderModel
+    :members: forward, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/splinter.rst
+++ b/docs/source/model_doc/splinter.rst
@@ -0,0 +1,87 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Splinter
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Splinter model was proposed in `Few-Shot Question Answering by Pretraining Span Selection
+<https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter
+is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus
+comprising Wikipedia and the Toronto Book Corpus.
+
+The abstract from the paper is the following:
+
+*In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order
+of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred
+training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between
+current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question
+answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all
+recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans
+are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select
+the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD
+with only 128 training examples), while maintaining competitive performance in the high-resource setting.*
+
+Tips:
+
+- Splinter was trained to predict answer spans conditioned on a special [QUESTION] token. These tokens contextualize
+  to question representations which are used to predict the answers. This layer is called QASS, and is the default
+  behaviour in the :class:`~transformers.SplinterForQuestionAnswering` class. Therefore:
+- Use :class:`~transformers.SplinterTokenizer` (rather than :class:`~transformers.BertTokenizer`), as it already
+  contains this special token. Also, its default behavior is to use this token when two sequences are given (for
+  example, in the `run_qa.py` script).
+- If you plan on using Splinter outside `run_qa.py`, please keep in mind the question token - it might be important for
+  the success of your model, especially in a few-shot setting.
+- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that
+  one also has the pretrained weights of the QASS layer (`tau/splinter-base-qass` and `tau/splinter-large-qass`) and one
+  doesn't (`tau/splinter-base` and `tau/splinter-large`). This is done to support randomly initializing this layer at
+  fine-tuning, as it is shown to yield better results for some cases in the paper.
+
+This model was contributed by `yuvalkirstain <https://huggingface.co/yuvalkirstain>`__ and `oriram
+<https://huggingface.co/oriram>`__. The original code can be found `here <https://github.com/oriram/splinter>`__.
+
+SplinterConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterConfig
+    :members:
+
+
+SplinterTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+SplinterTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterTokenizerFast
+    :members:
+
+
+SplinterModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterModel
+    :members: forward
+
+
+SplinterForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -13,9 +13,6 @@
 T5
 -----------------------------------------------------------------------------------------------------------------------

-**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -42,28 +39,56 @@ Tips:
  different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
  for summarization: *summarize: ...*.

-  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper
-  <https://arxiv.org/pdf/1910.10683.pdf>`__. - For sequence-to-sequence generation, it is recommended to use
-  :meth:`~transformers.generation_utils.GenerationMixin.generate`. This method takes care of feeding the encoded input
-  via cross-attention layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative
-  scalar embeddings. Encoder input padding can be done on the left and on the right.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+- See the :ref:`training`, :ref:`inference` and :ref:`scripts` sections below for all details regarding usage.
+
+T5 comes in different sizes:
+
+- `t5-small <https://huggingface.co/t5-small>`__
+
+- `t5-base <https://huggingface.co/t5-base>`__
+
+- `t5-large <https://huggingface.co/t5-large>`__
+
+- `t5-3b <https://huggingface.co/t5-3b>`__
+
+- `t5-11b <https://huggingface.co/t5-11b>`__.
+
+Based on the original T5 model, Google has released some follow-up works:
+
+- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without
+  mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found :doc:`here <t5v1.1>`.
+
+- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to
+  the documentation of mT5 which can be found :doc:`here <mt5>`.
+
+- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
+  to the documentation of byT5 which can be found :doc:`here <byt5>`.
+
+All checkpoints can be found on the `hub <https://huggingface.co/models?search=t5>`__.

 This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
 <https://github.com/google-research/text-to-text-transfer-transformer>`__.

+.. _training:
+
 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
-forcing. This means that for training we always need an input sequence and a target sequence. The input sequence is fed
-to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a start-sequence
-token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target sequence is then
-appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the start-sequence
-token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
+sequence is fed to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a
+start-sequence token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target
+sequence is then appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the
+start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+One can use :class:`~transformers.T5ForConditionalGeneration` (or the TensorFlow/Flax variant), which includes the
+language modeling head on top of the decoder.

 - Unsupervised denoising training

-  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+  In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
  the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
  sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`,
  :obj:`<extra_id_1>`, ... up to :obj:`<extra_id_99>`. As a default, 100 sentinel tokens are available in
@@ -72,26 +97,201 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
  For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
  processed as follows:

-.. code-block::
+    .. code-block::

-    input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-    labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+        labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+        # the forward function automatically creates the correct decoder_input_ids
+        loss = model(input_ids=input_ids, labels=labels).loss
+
+  If you're interested in pre-training T5 on a new corpus, check out the `run_t5_mlm_flax.py
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling>`__ script in the Examples
+  directory.

 - Supervised training

-  In this setup the input sequence and output sequence are standard sequence-to-sequence input output mapping. In
-  translation, for instance with the input sequence "The house is wonderful." and output sequence "Das Haus ist
-  wunderbar.", the sentences should be processed as follows:
+  In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
+  Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input
+  sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for
+  the model as follows:
+
+    .. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+        labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+        # the forward function automatically creates the correct decoder_input_ids
+        loss = model(input_ids=input_ids, labels=labels).loss
+
+  As you can see, only 2 inputs are required for the model in order to compute a loss: :obj:`input_ids` (which are the
+  :obj:`input_ids` of the encoded input sequence) and :obj:`labels` (which are the :obj:`input_ids` of the encoded
+  target sequence). The model will automatically create the :obj:`decoder_input_ids` based on the :obj:`labels`, by
+  shifting them one position to the right and prepending the :obj:`config.decoder_start_token_id`, which for T5 is
+  equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate
+  English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used
+  during T5's pre-training.
+
+  However, the example above only shows a single training example. In practice, one trains deep learning models in
+  batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one
+  typically defines a :obj:`max_source_length` and :obj:`max_target_length`, which determine the maximum length of the
+  input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on
+  the task.
+
+  In addition, we must make sure that padding token id's of the :obj:`labels` are not taken into account by the loss
+  function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the :obj:`ignore_index`
+  of the :obj:`CrossEntropyLoss`. In Flax, one can use the :obj:`decoder_attention_mask` to ignore padded tokens from
+  the loss (see the `Flax summarization script
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__ for details). We also pass
+  :obj:`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are
+  ignored. The code example below illustrates all of this.
+
+    .. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration 
+        import torch
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        # the following 2 hyperparameters are task-specific
+        max_source_length = 512
+        max_target_length = 128
+
+        # Suppose we have the following 2 training examples:
+        input_sequence_1 = "Welcome to NYC"
+        output_sequence_1 = "Bienvenue à NYC"
+
+        input_sequence_2 = "HuggingFace is a company"
+        output_sequence_2 = "HuggingFace est une entreprise"
+
+        # encode the inputs
+        task_prefix = "translate English to French: "
+        input_sequences = [input_sequence_1, input_sequence_2]
+        encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
+                             padding='longest', 
+                             max_length=max_source_length, 
+                             truncation=True, 
+                             return_tensors="pt")
+        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
+
+        # encode the targets
+        target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
+                                    padding='longest', 
+                                    max_length=max_target_length, 
+                                    truncation=True)
+        labels = target_encoding.input_ids
+
+        # replace padding token id's of the labels by -100
+        labels = [
+                   [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
+        ] 
+        labels = torch.tensor(labels)
+
+        # forward pass
+        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
+
+Additional training tips:
+
+- T5 models need a slightly higher learning rate than the default one set in the :obj:`Trainer` when using the AdamW
+  optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
+  answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
+
+- According to `this forum post <https://discuss.huggingface.co/t/t5-finetuning-tips/684>`__, task prefixes matter when
+  (1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
+  pre-training mixture (see Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`__ for the task prefixes
+  used).
+
+- If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
+  `pad_to_multiple_of` to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
+  batches to the longest example (i.e. only padding up to the longest example in a batch) is not recommended on TPU,
+  as it triggers a recompilation for every batch shape that is encountered during training, thus significantly slowing
+  down the training.
+
+.. _inference:
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+At inference time, it is recommended to use :meth:`~transformers.generation_utils.GenerationMixin.generate`. This
+method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
+and auto-regressively generates the decoder output. Check out `this blog post
+<https://huggingface.co/blog/how-to-generate>`__ to know all the details about generating text with Transformers.
+There's also `this blog post <https://huggingface.co/blog/encoder-decoder#encoder-decoder>`__ which explains how
+generation works in general in encoder-decoder models.

 .. code-block::

-    input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-    labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+        from transformers import T5Tokenizer, T5ForConditionalGeneration 

+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+        outputs = model.generate(input_ids)
+        print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        # Das Haus ist wunderbar.
+
+Note that T5 uses the :obj:`pad_token_id` as the :obj:`decoder_start_token_id`, so when doing generation without using
+:meth:`~transformers.generation_utils.GenerationMixin.generate`, make sure you start it with the :obj:`pad_token_id`.
+
+The example above only shows a single example. You can also do batched inference, like so:
+
+.. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        # when generating, we will use the logits of right-most token to predict the next token
+        # so the padding should be on the left
+        tokenizer.padding_side = "left" 
+        tokenizer.pad_token = tokenizer.eos_token # to avoid an error
+
+        task_prefix = 'translate English to German: '
+        sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+        inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
+
+        output_sequences = model.generate(
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
+            do_sample=False, # disable sampling to test if batching affects output
+        )
+
+        print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
+
+        # ['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
+
+.. _scripts:
+
+Example scripts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+T5 is supported by several example scripts, both for pre-training and fine-tuning.
+
+* pre-training: the `run_t5_mlm_flax.py
+  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_t5_mlm_flax.py>`__
+  script allows you to further pre-train T5 or pre-train T5 from scratch on your own data. The `t5_tokenizer_model.py
+  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/t5_tokenizer_model.py>`__
+  script allows you to further train a T5 tokenizer or train a T5 Tokenizer from scratch on your own data. Note that
+  Flax (a neural network library on top of JAX) is particularly useful to train on TPU hardware.
+
+* fine-tuning: T5 is supported by the official summarization scripts (`PyTorch
+  <https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization>`__, `Tensorflow
+  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/summarization>`__, and `Flax
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__) and translation scripts
+  (`PyTorch <https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation>`__ and `Tensorflow
+  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/translation>`__). These scripts allow
+  you to easily fine-tune T5 on custom data for summarization/translation.

 T5Config
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -152,3 +352,15 @@ TFT5EncoderModel

 .. autoclass:: transformers.TFT5EncoderModel
    :members: call
+
+FlaxT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxT5Model
+    :members: __call__, encode, decode
+
+FlaxT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxT5ForConditionalGeneration
+    :members: __call__, encode, decode
--- a/docs/source/model_doc/t5v1.1.rst
+++ b/docs/source/model_doc/t5v1.1.rst
@@ -0,0 +1,66 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+T5v1.1
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+T5v1.1 was released in the `google-research/text-to-text-transfer-transformer
+<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__
+repository by Colin Raffel et al. It's an improved version of the original T5 model.
+
+One can directly plug in the weights of T5v1.1 into a T5 model, like so:
+
+.. code-block::
+
+    from transformers import T5ForConditionalGeneration
+
+    model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base') 
+
+T5 Version 1.1 includes the following improvements compared to the original T5 model:
+
+- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See `this paper
+  <https://arxiv.org/abs/2002.05202>`__.
+
+- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning.
+
+- Pre-trained on C4 only without mixing in the downstream tasks.
+
+- No parameter sharing between the embedding and classifier layer.
+
+- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger :obj:`d_model` and smaller
+  :obj:`num_heads` and :obj:`d_ff`.
+
+Note: T5 Version 1.1 was only pre-trained on `C4 <https://huggingface.co/datasets/c4>`__ excluding any supervised
+training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
+model. Since T5v1.1 was pre-trained in an unsupervised fashion, there's no real advantage to using a task prefix during
+single-task fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- `google/t5-v1_1-small <https://huggingface.co/google/t5-v1_1-small>`__
+
+- `google/t5-v1_1-base <https://huggingface.co/google/t5-v1_1-base>`__
+
+- `google/t5-v1_1-large <https://huggingface.co/google/t5-v1_1-large>`__
+
+- `google/t5-v1_1-xl <https://huggingface.co/google/t5-v1_1-xl>`__
+
+- `google/t5-v1_1-xxl <https://huggingface.co/google/t5-v1_1-xxl>`__.
+
+One can refer to :doc:`T5's documentation page <t5>` for all tips, code examples and notebooks.
+
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
+found `here
+<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__.
--- a/docs/source/model_doc/visual_bert.rst
+++ b/docs/source/model_doc/visual_bert.rst
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
        >>> # this is a custom function that returns the visual embeddings given the image path
        >>> visual_embeds = get_visual_embeddings(image_path)

+        >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+        >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+        >>> inputs.update({
+        ...     "visual_embeds": visual_embeds,
+        ...     "visual_token_type_ids": visual_token_type_ids,
+        ...     "visual_attention_mask": visual_attention_mask
+        ... })
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state

--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -66,6 +66,23 @@ Tips:
  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.

+Following the original Vision Transformer, some follow-up works have been made:
+
+- DeiT (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. Refer to
+  :doc:`DeiT's documentation page <deit>`. The authors of DeiT also released more efficiently trained ViT models, which
+  you can directly plug into :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. There
+  are 4 variants available (in 3 different sizes): `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`,
+  `facebook/deit-base-patch16-224` and `facebook/deit-base-patch16-384`. Note that one should use
+  :class:`~transformers.DeiTFeatureExtractor` in order to prepare images for the model.
+
+- BEiT (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained
+  vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE.
+  Refer to :doc:`BEiT's documentation page <beit>`.
+
+- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using
+  the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting
+  objects, without having ever been trained to do so. DINO checkpoints can be found on the `hub
+  <https://huggingface.co/models?other=dino>`__.

 This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code (written in JAX) can be
 found `here <https://github.com/google-research/vision_transformer>`__.
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -67,6 +67,22 @@ Wav2Vec2Processor
    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor


+Wav2Vec2 specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput
+    :members: 
+
+
 Wav2Vec2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -80,6 +96,14 @@ Wav2Vec2ForCTC
 .. autoclass:: transformers.Wav2Vec2ForCTC
    :members: forward

+
+Wav2Vec2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2ForSequenceClassification
+    :members: forward
+
+
 Wav2Vec2ForPreTraining
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -99,3 +123,23 @@ TFWav2Vec2ForCTC

 .. autoclass:: transformers.TFWav2Vec2ForCTC
    :members: call
+
+
+FlaxWav2Vec2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxWav2Vec2Model
+    :members: __call__
+
+
+FlaxWav2Vec2ForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxWav2Vec2ForCTC
+    :members: __call__
+
+FlaxWav2Vec2ForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxWav2Vec2ForPreTraining
+    :members: __call__
--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -341,8 +341,8 @@ Add a model card

 To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
 please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
-titled "Add a README.md" on your model page. A model card template can be found `here
-<https://github.com/huggingface/model_card>`__ (meta-suggestions are welcome). model card template (meta-suggestions
+titled "Add a README.md" on your model page. The model card documentation can be found `here
+<https://huggingface.co/docs/hub/model-repos>`__ (meta-suggestions
 are welcome).

 .. note::
--- a/docs/source/parallelism.md
+++ b/docs/source/parallelism.md
@@ -0,0 +1,357 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Model Parallelism
+
+
+## Parallelism overview
+
+In modern machine learning, various approaches to parallelism are used to:
+1. fit very large models onto limited hardware - e.g. t5-11b is 45GB in just model params
+2. significantly speed up training - finish training that would take a year in hours
+
+We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented.
+
+While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations.
+
+
+## Concepts
+
+The following is a brief description of the main concepts that will be described later in depth in this document.
+
+1. DataParallel (DP) - the same setup is replicated multiple times, with each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
+2. TensorParallel (TP) - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level.
+3. PipelineParallel (PP) - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are placed on a single gpu. Each gpu processes in parallel different stages of the pipeline and works on a small chunk of the batch.
+4. Zero Redundancy Optimizer (ZeRO) - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
+5. Sharded DDP - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.
+
+
+## Data Parallel
+
+Most users with just 2 GPUs already enjoy the increased training speed thanks to DataParallel (DP) and DistributedDataParallel (DDP), which are almost trivial to use. This is a built-in feature of PyTorch.
+
+## ZeRO Data Parallel
+
+ZeRO-powered data parallelism (ZeRO-DP) is illustrated in the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/):
+![DeepSpeed-Image-1](imgs/parallelism-zero.png)
+
+It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual DataParallel (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of them. And then at run-time, when the full layer params are needed just for the given layer, all GPUs synchronize to give each other the parts that they are missing - that's all there is to it.
+
+Consider this simple model with 3 layers, where each layer has 3 params:
+```
+La | Lb | Lc
+---|----|---
+a0 | b0 | c0
+a1 | b1 | c1
+a2 | b2 | c2
+```
+Layer La has weights a0, a1 and a2.
+
+If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so:
+
+```
+GPU0:
+La | Lb | Lc
+---|----|---
+a0 | b0 | c0
+
+GPU1:
+La | Lb | Lc
+---|----|---
+a1 | b1 | c1
+
+GPU2:
+La | Lb | Lc
+---|----|---
+a2 | b2 | c2
+```
+
+In a way, this is the same horizontal slicing as in tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point.
+
+Now each of these GPUs will get the usual mini-batch as it works in DP:
+```
+x0 => GPU0
+x1 => GPU1
+x2 => GPU2
+```
+
+The inputs are unmodified - they think they are going to be processed by the normal model.
+
+First, the inputs hit the layer La.
+
+Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together.
+
+In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2.
+
+Same happens to GPU2 that gets input x2. It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor.
+
+All 3 GPUs get the full tensors reconstructed and a forward happens.
+
+As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch.
+
+And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La.
+
+To me this sounds like an efficient group backpacking weight distribution strategy:
+
+1. person A carries the tent
+2. person B carries the stove
+3. person C carries the axe
+
+Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP.
+
+Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far less efficient. This is DataParallel (DP and DDP) in PyTorch.
+
+While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned.
+
+If you pay close attention to the way ZeRO partitions the model's weights, it looks very similar to tensor parallelism, which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next.
+
+Implementations:
+
+- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3
+- [Fairscale](https://github.com/facebookresearch/fairscale/#optimizer-state-sharding-zero) ZeRO-DP stages 1+2+3
+- [`transformers` integration](https://huggingface.co/transformers/master/main_classes/trainer.html#trainer-integrations)
+
+## Naive Model Parallel (Vertical) and Pipeline Parallel
+
+Naive Model Parallel (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified.
+
+We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. For example, if the following diagram shows an 8-layer model:
+
+```
+===================  ===================
+|  0 | 1 | 2 | 3  |  |  4 | 5 | 6 | 7  |
+===================  ===================
+        gpu0                 gpu1
+```
+we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1.
+
+Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger.
+
+Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work.
+
+Problems:
+- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states)
+- shared embeddings may need to get copied back and forth between GPUs.
+
+Pipeline Parallel (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process.
+
+The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom:
+
+![mp-pp](imgs/parallelism-gpipe-bubble.png)
+
+It's easy to see from the bottom diagram how PP has fewer dead zones, where GPUs are idle. The idle parts are referred to as the "bubble".
+
+Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0.
+
+PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).
+
+Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS.
+
+Because of the chunks, PP introduces the concept of micro-batches (MBS). DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time.
+
+To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`).
+
+Let's go back to the diagram.
+
+With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes which could be not very efficient either. So one has to experiment to find the value that leads to the most efficient utilization of the gpus.
+
+While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble.
+
+Problems:
+- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model.
+- currently the Pipeline API is very restricted. If you had a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
+- have to arrange each layer so that the output of one model becomes an input to the other model
+
+Implementations:
+- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
+- [FairScale](https://fairscale.readthedocs.io/en/latest/tutorials/pipe.html)
+- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
+
+🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support. The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs be Tensors. This is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that.
+
+Other approaches:
+
+DeepSpeed and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html)
+![interleaved-pipeline-execution](imgs/parallelism-sagemaker-interleaved-pipeline.png)
+
+Here the bubble (idle time) is further minimized by prioritizing backward passes.
+
+According to [the same document](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html), it might be able to automate the non `nn.Sequential` model conversion to pipeline. The only problem is that this is currently only available at AWS, so you can't run it on your own hardware.
+
+
+## Tensor Parallelism
+
+In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing.
+
+In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).
+
+The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`.
+
+Following the Megatron's paper notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix.
+
+If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs:
+![Parallel GEMM](imgs/parallelism-tp-parallel_gemm.png)
+
+If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:
+![independent GeLU](imgs/parallelism-tp-independent-gelu.png)
+
+Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that:
+![parallel shard processing](imgs/parallelism-tp-parallel_shard_processing.png)
+
+Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads!
+![parallel self-attention](imgs/parallelism-tp-parallel_self_attention.png)
+
+Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs.
+
+This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530)
+by [@anton-l](https://github.com/anton-l).
+
+Alternative names:
+- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
+
+Implementations:
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific
+- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)
+
+🤗 Transformers status:
+- core: not yet implemented in the core
+- but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
+- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)
+
+
+
+## DP+PP
+
+The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP.
+
+![dp-pp-2d](imgs/parallelism-zero-dp-pp.png)
+
+Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+
+🤗 Transformers status: not yet implemented
+
+
+## DP+PP+TP
+
+To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram.
+
+![dp-pp-tp-3d](imgs/parallelism-deepspeed-3d.png)
+
+This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+
+## DP+PP+TP+ZeRO
+
+One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallel](#zero-data-parallel). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP.
+
+When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding).
+
+While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt.
+
+In addition, there are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP.
+
+ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required.
+
+And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1, optimizer states can be offloaded to CPU.
+
+Implementations:
+- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+
+## FlexFlow
+
+[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem in a slightly different approach.
+
+Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358)
+
+It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
+
+1. Sample = Data Parallelism
+2. Operator = part vertical Layer Parallelism, but it can split the layer too - more refined level
+3. Attribute = horizontal Model Parallelism (Megatron-LM style)
+4. Parameter = Sharded model params
+
+and they are working on Pipeline Parallelism. I guess ZeRO-DP is Sample+Parameter in this context.
+
+![flex-flow-soap](imgs/parallelism-flexflow.jpeg)
+
+The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these  algorithmically deciding which parallelisation to use where.
+
+One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations.
+
+So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization.
+
+🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models.
+
+
+## Which Strategy To Use When
+
+Here is a very rough outlook at which parallelism strategy to use when. The first on the list is typically faster.
+
+**⇨ Single GPU**
+
+* Model fits onto a single GPU:
+
+    1. Normal use
+
+* Model doesn't fit onto a single GPU:
+
+    1. ZeRO + Offload CPU and optionally NVMe
+
+
+**⇨ Single Node / Multi-GPU**
+
+* Model fits onto a single GPU:
+
+    1. DDP - Distributed DP
+    2. ZeRO - may or may not be faster depending on the situation and configuration used
+
+* Model doesn't fit onto a single GPU:
+
+    1. PP
+    2. ZeRO
+    3. TP
+
+    With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par; without these, PP will be faster than TP and ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
+
+
+**⇨ Multi-Node / Multi-GPU**
+
+* When you have fast inter-node connectivity:
+
+    1. ZeRO - as it requires close to no modifications to the model
+    2. PP+TP+DP - less communications, but requires massive changes to the model
+
+* When you have slow inter-node connectivity and are still low on GPU memory:
+
+    1. DP+PP+TP+ZeRO-1
--- a/docs/source/performance.md
+++ b/docs/source/performance.md
@@ -0,0 +1,359 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Performance and Scalability: How To Fit a Bigger Model and Train It Faster
+
+For now the software sections of this document are mainly Pytorch-specific, but the guide can be extended to other frameworks in the future.
+
+## Quick notes
+
+This section gives brief ideas on how to make training faster and support bigger models. Later sections will expand, demonstrate and elucidate each of these.
+
+### Faster Training
+
+Hardware:
+
+- fast connectivity between GPUs
+    * intra-node: NVLink
+    * inter-node: Infiniband / Intel OPA
+
+Software:
+
+- Data Parallel / Distributed Data Parallel
+- fp16 (autocast caching)
+
+
+### Bigger Models
+
+Hardware:
+
+- bigger GPUs
+- more GPUs
+- more CPU and NVMe (offloaded to by DeepSpeed)
+
+Software:
+
+- Deepspeed ZeRO
+- Deepspeed ZeRO-Offload
+- Megatron-LM 3D Parallelism
+- Pipeline Parallelism
+- Tensor Parallelism
+- Low-memory Optimizers
+- fp16/bf16 (smaller data)
+- Gradient checkpointing
+
+
+
+## Hardware
+
+### Multi-GPU Connectivity
+
+If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time.
+
+If the GPUs are on the same physical node, you can run:
+
+```
+nvidia-smi topo -m
+```
+
+and it will tell you how the GPUs are inter-connected.
+
+On a machine with two GPUs that are connected with NVLink, you will most likely see something like:
+
+```
+        GPU0    GPU1    CPU Affinity    NUMA Affinity
+GPU0     X      NV2     0-23            N/A
+GPU1    NV2      X      0-23            N/A
+```
+
+on a different machine w/o NVLink we may see:
+```
+        GPU0    GPU1    CPU Affinity    NUMA Affinity
+GPU0     X      PHB     0-11            N/A
+GPU1    PHB      X      0-11            N/A
+```
+
+The report includes this legend:
+
+```
+  X    = Self
+  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
+  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
+  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
+  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
+  PIX  = Connection traversing at most a single PCIe bridge
+  NV#  = Connection traversing a bonded set of # NVLinks
+```
+
+So in the first report `NV2` tells us the GPUs are interconnected with 2 NVLinks, and in the second report `PHB` tells us we have a typical consumer-level PCIe+Bridge setup.
+
+Check what type of connectivity you have on your setup. Some of these will make the communication between cards faster (e.g. NVLink), others slower (e.g. PHB).
+
+Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training.
+
+### NVlink
+
+[NVLink](https://en.wikipedia.org/wiki/NVLink) is a wire-based serial multi-lane near-range communications link developed by Nvidia.
+
+Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvidia Ampere GA102 GPU Architecture](https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf):
+
+> Third-Generation NVLink®
+> GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links,
+> with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. Four
+> links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth
+> between two GPUs. Two RTX 3090 GPUs can be connected together for SLI using NVLink.
+> (Note that 3-Way and 4-Way SLI configurations are not supported.)
+
+So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture.
+
+Let's compare the execution of a gpt2 language model training over a small sample of wikitext.
+
+The results are:
+
+
+| NVlink | Time |
+| -----  | ---: |
+| Y      | 101s |
+| N      | 131s |
+
+
+You can see that NVLink completes the training ~23% faster.
+
+In the second benchmark we use `NCCL_P2P_DISABLE=1` to tell the GPUs not to use NVLink.
+
+Here is the full benchmark code and outputs:
+
+```
+# DDP w/ NVLink
+
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
+--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+
+{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
+
+# DDP w/o NVLink
+
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
+--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
+--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+
+{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
+```
+
+Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`)
+Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
+
+## Software
+
+### Anatomy of Model's Memory
+
+The components on GPU memory are the following:
+- the model weights
+- the forward activations saved for gradient computation
+- the gradients
+- the optimizer state
+
+### `forward` vs `backward` Execution Speed
+
+For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). Activations are usually bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward (e.g. activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, and writes once, gradInput).
+
+### fp16
+
+AMP = Automatic Mixed Precision
+
+If we look at what's happening with FP16 training (mixed precision) we have:
+- the model has two copies in memory: one in half-precision for the forward/backward computations and one in full precision - no memory saved here
+- the forward activations saved for gradient computation are in half-precision - memory is saved here
+- the gradients are computed in half-precision *but* converted to full-precision for the update, no saving there
+- the optimizer states are in full precision as all the updates are done in full-precision
+
+So the savings only happen for the forward activations saved for the backward computation, and there is a slight overhead because the model weights are stored both in half- and full-precision.
+
+Now let's look at a simple text-classification fine-tuning on 2 GPUs (I'm giving the command for reference):
+```
+export BS=16
+python -m torch.distributed.launch \
+    --nproc_per_node 2 examples/pytorch/text-classification/run_glue.py \
+    --model_name_or_path bert-base-cased \
+    --task_name mrpc \
+    --do_train \
+    --do_eval \
+    --max_seq_length 128 \
+    --per_device_train_batch_size $BS \
+    --learning_rate 2e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc \
+    --overwrite_output_dir \
+    --fp16
+```
+Since the only savings we get are in the model activations saved for the backward pass, it's logical that the bigger those activations are, the bigger the saving will be. If we try different batch sizes, I indeed get (this is with `nvidia-smi` so not completely reliable as said above but it will be a fair comparison):
+
+| batch size | w/o --fp16 | w/ --fp16 | savings |
+| ---------: | ---------: | --------: | ------: |
+|          8 |       4247 |      4163 |      84 |
+|         16 |       4971 |      4793 |     178 |
+|         32 |       6827 |      6207 |     620 |
+|         64 |      10037 |      8061 |    1976 |
+
+So there is only a real memory saving if we train at a high batch size (and it's not half) and at batch sizes lower than 8, you actually get a bigger memory footprint (because of the overhead mentioned above). The gain for FP16 training is that in each of those cases, the training with the flag `--fp16` is twice as fast, which does require every tensor to have every dimension be a multiple of 8 (examples pad the tensors to a sequence length that is a multiple of 8).
+
+Summary: FP16 with apex or AMP will only give you some memory savings with a reasonably high batch size.
+
+Additionally, under mixed precision when possible, it's important that the batch size is a multiple of 8 to efficiently use tensor cores.
+
+Some amazing tutorials to read on mixed precision:
+- @sgugger wrote a great explanation of mixed precision [here](https://docs.fast.ai/callback.fp16.html#A-little-bit-of-theory)
+- Aleksey Bilogur's [A developer-friendly guide to mixed precision training with PyTorch](https://spell.ml/blog/mixed-precision-training-with-pytorch-Xuk7YBEAACAASJam)
+
+### fp16 caching
+
+pytorch `autocast`, which performs AMP, includes a caching feature, which speeds things up by caching fp16-converted values. Here is the full description from this [comment](https://discuss.pytorch.org/t/autocast-and-torch-no-grad-unexpected-behaviour/93475/3):
+
+Autocast maintains a cache of the FP16 casts of model params (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 params get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)
+
+
+### Gradient Checkpointing
+
+One way to use significantly less GPU memory is to enable "Gradient Checkpointing" (also known as "activation checkpointing"). When enabled, a lot of memory can be freed at the cost of a small decrease in the training speed due to recomputing parts of the graph during back-propagation.
+
+This technique was first shared in the paper: [Training Deep Nets with Sublinear Memory Cost](https://arxiv.org/abs/1604.06174). The paper will also give you the exact details on the savings, but it's in the ballpark of `O(sqrt(n))`, where `n` is the number of feed-forward layers.
+
+To activate this feature in 🤗 Transformers for models that support it, use:
+
+```python
+model.gradient_checkpointing_enable()
+```
+or add `--gradient_checkpointing` to the Trainer arguments.
+
+
+### Batch sizes
+
+One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model.
+
+For example for fully connected layers (which correspond to GEMMs), NVIDIA provides recommendations for [input/output neuron counts](
+https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and [batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size).
+
+[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) define the multiplier based on the dtype and the hardware. For example, for fp16 a multiple of 8 is recommended, but on A100 it's 64!
+
+For parameters that are small, there is also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) to consider, this is where tiling happens and the right multiplier can have a significant speedup.
+
+
+### DP vs DDP
+
+`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case:
+* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL
+* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP
+
+Here are the main differences in the inter-GPU communication overhead between the two modes:
+
+[DDP](https://pytorch.org/docs/master/notes/ddp.html):
+
+- At the start time the main process replicates the model once from gpu 0 to the rest of gpus
+- Then for each batch:
+   1. each gpu consumes its own mini-batch of data directly
+   2. during `backward`, once the local gradients are ready, they are then averaged across all processes
+
+[DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html):
+
+For each batch:
+   1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu
+   2. replicates the up-to-date model from gpu 0 to each gpu
+   3. runs `forward` and sends output from each gpu to gpu 0, computes loss
+   4. scatters loss from gpu 0 to all gpus, runs `backward`
+   5. sends gradients from each gpu to gpu 0 and averages those
+
+The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch.
+
+DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html).
+
+Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus.
+
+You can use DDP across multiple machines, but this is not the case with DP.
+
+There are other differences between DP and DDP but they aren't relevant to this discussion.
+
+If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know.
+
+Let's look at an actual benchmark:
+
+| Type   | NVlink | Time |
+| :----- | -----  | ---: |
+| 2:DP   | Y      | 110s |
+| 2:DDP  | Y      | 101s |
+| 2:DDP  | N      | 131s |
+
+
+Analysis:
+
+Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink
+
+The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime.
+
+Here is the full benchmark code and outputs:
+
+`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark.
+
+```
+
+# DP
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
+python examples/pytorch/language-modeling/run_clm.py \
+--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+
+{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
+
+# DDP w/ NVlink
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
+python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+
+{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
+
+# DDP w/o NVlink
+rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
+python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+--model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
+--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+
+{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
+```
+
+Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`)
+Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
+
+
+### DataLoader
+
+One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it can handle. By default everything happens in the main process and it might not be able to read the data from disk fast enough, and thus create a bottleneck, leading to GPU under-utilization.
+
+- `DataLoader(pin_memory=True, ...)` which ensures that the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.
+-  `DataLoader(num_workers=4, ...)` - spawn several workers to pre-load data faster - during training watch the GPU utilization stats and if it's far from 100% experiment with raising the number of workers. Of course, the problem could be elsewhere so a very big number of workers won't necessarily lead to a better performance.
+
+### Faster optimizer
+
+pytorch-nightly introduced `torch.optim._multi_tensor` which should significantly speed up the optimizers for situations with lots of small feature tensors. It should eventually become the default, but if you want to experiment with it sooner and don't mind using the bleeding edge, see: https://github.com/huggingface/transformers/issues/9965
+
+
+## Contribute
+
+This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there.
+
+When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the source of that information (unless it comes directly from you).
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -96,11 +96,11 @@ dataset in memory.

 .. code-block:: python

-    from nlp import load_dataset
+    from datasets import load_dataset
    test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')

-With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average
+With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average negative
 log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in
 the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating
 as context to be included in our loss, so we can set these targets to ``-100`` so that they are ignored. The following
@@ -110,10 +110,13 @@ available to condition on).

 .. code-block:: python

+    import torch
+    from tqdm import tqdm
+
    max_length = model.config.n_positions
    stride = 512

-    lls = []
+    nlls = []
    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
@@ -124,11 +127,11 @@ available to condition on).

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs[0] * trg_len
+            neg_log_likelihood = outputs[0] * trg_len

-        lls.append(log_likelihood)
+        nlls.append(neg_log_likelihood)

-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
+    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

 Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window
 strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction,
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -243,15 +243,16 @@ three arguments you need to know for this are :obj:`padding`, :obj:`truncation`

 - :obj:`truncation` controls the truncation. It can be a boolean or a string which should be:

-    - :obj:`True` or :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
+    - :obj:`True` or :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
      the maximum length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will
-      only truncate the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
+      truncate token by token, removing a token from the longest sequence in the pair until the proper length is
+      reached.
    - :obj:`'only_second'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
      the second sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
-    - :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
-      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will truncate token
-      by token, removing a token from the longest sequence in the pair until the proper length is reached.
+    - :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
+      the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided.
    - :obj:`False` or :obj:`'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the
      default behavior.

--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -65,10 +65,10 @@ make them readable. For instance:
 .. code-block::

    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
-    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
+    [{'label': 'POSITIVE', 'score': 0.9998}]

-That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
-`batch`, returning a list of dictionaries like this one:
+That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model, returning
+a list of dictionaries like this one:

 .. code-block::

@@ -79,6 +79,8 @@ That's encouraging! You can use it on a list of sentences, which will be preproc
    label: POSITIVE, with score: 0.9998
    label: NEGATIVE, with score: 0.5309

+To use with a large dataset, look at :doc:`iterating over a pipeline <./main_classes/pipelines>`.
+
 You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
 fairly neutral.

@@ -195,7 +197,8 @@ sequence:
 .. code-block::

    >>> print(inputs)
-    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

 You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
 batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
@@ -260,12 +263,12 @@ objects are described in greater detail :doc:`here <main_classes/output>`. For n
    >>> ## PYTORCH CODE
    >>> print(pt_outputs)
    SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833,  4.3364],
-        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> print(tf_outputs)
    TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.3364143 ],
-           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)
+    array([[-4.0833 ,  4.3364  ],
+           [ 0.0818, -0.0418]], dtype=float32)>, hidden_states=None, attentions=None)

 Notice how the output object has a ``logits`` attribute. You can use this to access the model's final activations.

@@ -283,7 +286,7 @@ Let's apply the SoftMax activation to get predictions.
    >>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
-    >>> tf.nn.softmax(tf_outputs.logits, axis=-1)
+    >>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)

 We can see we get the numbers from before:

@@ -292,8 +295,8 @@ We can see we get the numbers from before:
    >>> ## TENSORFLOW CODE
    >>> print(tf_predictions)
    tf.Tensor(
-    [[2.2042994e-04 9.9977952e-01]
-     [5.3086340e-01 4.6913657e-01]], shape=(2, 2), dtype=float32)
+    [[2.2043e-04 9.9978e-01]
+     [5.3086e-01 4.6914e-01]], shape=(2, 2), dtype=float32)
    >>> ## PYTORCH CODE
    >>> print(pt_predictions)
    tensor([[2.2043e-04, 9.9978e-01],
@@ -309,14 +312,14 @@ attribute:
    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
    >>> print(pt_outputs)
    SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833,  4.3364],
-    [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
    >>> print(tf_outputs)
-    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051287e-04, 6.3326043e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.3364143 ],
-           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)
+    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051e-04, 6.3326e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0833 ,  4.3364  ],
+           [ 0.0818, -0.0418]], dtype=float32)>, hidden_states=None, attentions=None)

 Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or `tf.keras.Model
 <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual training loop. 🤗
--- a/docs/source/sagemaker.md
+++ b/docs/source/sagemaker.md
@@ -16,388 +16,10 @@ limitations under the License.

 # Run training on Amazon SageMaker

-Hugging Face and Amazon are introducing new [Hugging Face Deep Learning Containers (DLCs)](#deep-learning-container-dlc-overview) to make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
+The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. 

-You can find a full list of all available [Hugging Face Deep Learning Containers](#deep-learning-container-dlc-overview) at the end of this page. 
+### Table of Contents

-To learn how to access and use the new Hugging Face DLCs with the Amazon SageMaker Python SDK, check out the guides and resources below.
-
---
-
-
-## Getting Started: Train a 🤗 Transformers Model
-
-To train a 🤗 Transformers model by using the `HuggingFace` SageMaker Python SDK you need to:
-
- [Prepare a training script](#prepare-a-transformers-fine-tuning-script)
- [Create a `HuggingFace` Estimator](#create-an-huggingface-estimator)
- [Run training by calling the `fit` method](#execute-training)
- [Access you model](#access-trained-model)
-
-### Setup & Installation
-
-Before you can train a transformers models with Amazon SageMaker you need to sign up for an AWS account. If you do not have an AWS account yet learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).
-
-After you complete these tasks you can get started using either [SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio-onboard.html), [SageMaker Notebook Instances](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html), or a local environment. To start training locally you need configure the right [IAM permission](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).
-
-Upgrade to the latest `sagemaker` version.
-
-```bash
-pip install sagemaker --upgrade
-```
-
-**SageMaker environment**
-
-_Note: The execution role is intended to be available only when running a notebook within SageMaker. If you run `get_execution_role` in a notebook not on SageMaker, expect a "region" error._
-
-```python
-import sagemaker
-sess = sagemaker.Session()
-role = sagemaker.get_execution_role()
-```
-
-**Local environment**
-
-```python
-import sagemaker
-import boto3
-
-iam_client = boto3.client('iam')
-role = iam_client.get_role(RoleName='role-name-of-your-iam-role-with-right-permissions')['Role']['Arn']
-sess = sagemaker.Session()
-```
-
-### Prepare a 🤗 Transformers fine-tuning script.
-
-The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following:
-
- `SM_MODEL_DIR`: A string that represents the path where the training job writes the model artifacts to. After training, artifacts in this directory are uploaded to S3 for model hosting. `SM_MODEL_DIR` is always set to `/opt/ml/model`.
-
- `SM_NUM_GPUS`: An integer representing the number of GPUs available to the host.
-
- `SM_CHANNEL_XXXX:` A string that represents the path to the directory that contains the input data for the specified channel. For example, if you specify two input channels in the HuggingFace estimator’s fit call, named `train` and `test`, the environment variables `SM_CHANNEL_TRAIN` and `SM_CHANNEL_TEST` are set.
-
-You can find a full list of the exposed environment variables [here](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md).
-
-Later we define `hyperparameters` in the [HuggingFace Estimator](#create-an-huggingface-estimator), which are passed in as named arguments and and can be processed with the `ArgumentParser()`.
-
-```python
-import transformers
-import datasets
-import argparse
-import os
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-
-    # hyperparameters sent by the client are passed as command-line arguments to the script.
-    parser.add_argument("--epochs", type=int, default=3)
-    parser.add_argument("--per_device_train_batch_size", type=int, default=32)
-    parser.add_argument("--model_name_or_path", type=str)
-
-    # Data, model, and output directories
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
-    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
-```
-
-_Note that SageMaker doesn’t support argparse actions. For example, if you want to use a boolean hyperparameter, specify `type` as `bool` in your script and provide an explicit `True` or `False` value._
-
-For a complete example of a 🤗 Transformers training script, see [train.py](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py)
-
-### Create an HuggingFace Estimator
-
-You run 🤗 Transformers training scripts on SageMaker by creating `HuggingFace` Estimators. The Estimator handles end-to-end Amazon SageMaker training. The training of your script is invoked when you call `fit` on a `HuggingFace` Estimator. In the Estimator you define, which fine-tuning script should be used as `entry_point`, which `instance_type` should be used, which `hyperparameters` are passed in, you can find all possible `HuggingFace` Parameter [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html#huggingface-estimator). and an example of a fine-tuning script [here](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py).
-You can find all useable `instance_types` [here](https://aws.amazon.com/de/sagemaker/pricing/).
-
-The following code sample shows how you train a custom `HuggingFace` script `train.py`, passing in three hyperparameters (`epochs`, `per_device_train_batch_size`, and `model_name_or_path`).
-
-```python
-from sagemaker.huggingface import HuggingFace
-
-
-# hyperparameters, which are passed into the training job
-hyperparameters={'epochs': 1,
-                 'per_device_train_batch_size': 32,
-                 'model_name_or_path': 'distilbert-base-uncased'
-                 }
-
-# create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters = hyperparameters
-)
-```
-
-To run the `TrainingJob` locally you can define `instance_type='local'` or `instance_type='local-gpu'` for gpu usage. _Note: this does not working within SageMaker Studio_
-
-### Execute Training
-
-You start your `TrainingJob` by calling `fit` on a `HuggingFace` Estimator. In the `fit` method you specify your input training data, like a string S3 URI `s3://my-bucket/my-training-data` or a `FileSystemInput` for [EFS or FSx Lustre](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs), see [here](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs).
-
-```python
-huggingface_estimator.fit(
-  {'train': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/train',
-   'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'}
-)
-
-```
-
-SageMaker takes care of starting and managing all the required ec2 instances for ands starts the training job by running.
-
-```bash
-/opt/conda/bin/python train.py --epochs 1 --model_name_or_path distilbert-base-uncased --per_device_train_batch_size 32
-```
-
-### Access trained model
-
-After training is done you can access your model either through the [AWS console](https://console.aws.amazon.com/console/home?nc2=h_ct&src=header-signin) or downloading it directly from S3.
-
-```python
-from sagemaker.s3 import S3Downloader
-
-S3Downloader.download(
-    s3_uri=huggingface_estimator.model_data, # s3 uri where the trained model is located
-    local_path='.', # local path where *.targ.gz is saved
-    sagemaker_session=sess # sagemaker session used for training the model
-)
-```
-
---
-
-## Sample Notebooks
-
-You can find here a list of the official notebooks provided by Hugging Face.
-
-| Notebook                                                                                                                                                                                        | Description                                                                                                      |
-| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
-| [Getting Started Pytorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb)                                                   | End-to-End binary Text-Classification example using `Trainer` and `imdb` dataset                                 |
-| [Getting Started Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)                                             | End-to-End binary Text-Classification example using `Keras` and `imdb` dataset                                   |
-| [Distributed Training Data Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb)                       | End-to-End distributed Question-Answering example using `Trainer` and 🤗 Transformers example script for `SQAuD` |
-| [Distributed Training Model Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)                     | End-to-End model parallelism example using `SageMakerTrainer` and `run_glue.py` script                           |
-| [Spot Instances and continues training](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)                                              | End-to-End to Text-Classification example using spot instances with continued training.                          |
-| [SageMaker Metrics](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)                                                               | End-to-End to Text-Classification example using SageMaker Metrics to extract and log metrics during training     |
-| [Distributed Training Data Parallelism Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | End-to-End distributed binary Text-Classification example using `Keras` and `TensorFlow`                    
-| [Distributed Seq2Seq Training with Data Parallelism and BART](https://github.com/huggingface/notebooks/blob/master/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) | End-to-End distributed summarization example with `BART-large` and 🤗 Transformers example script for `summarization`                        |
-| [Image Classification using Vision Transformer](https://github.com/huggingface/notebooks/blob/master/sagemaker/09_image_classification_vision_transformer/sagemaker-notebook.ipynb) | End-to-End image classification example with `Vision Transformers`                 |
-
---
-
-## Advanced Features
-
-In addition to the Deep Learning Container and the SageMaker SDK, we have implemented other additional features.
-
-### Distributed Training: Data-Parallel
-
-You can use [SageMaker Data Parallelism Library](https://aws.amazon.com/blogs/aws/managed-data-parallelism-in-amazon-sagemaker-simplifies-training-on-large-datasets/) out of the box for distributed training. We added the functionality of Data Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.
-
- [Example Notebook PyTorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
- [Example Notebook TensorFlow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
-
-```python
-# configuration for running training on smdistributed Data Parallel
-distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
-
-# create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3dn.24xlarge',
-        instance_count=2,
-        role=role,
-        transformers_version='4.4.2',
-        pytorch_version='1.6.0',
-        py_version='py36',
-        hyperparameters = hyperparameters
-        distribution = distribution
-)
-
-```
-
-### Distributed Training: Model-Parallel
-
-You can use [SageMaker Model Parallelism Library](https://aws.amazon.com/blogs/aws/amazon-sagemaker-simplifies-training-deep-learning-models-with-billions-of-parameters/) out of the box for distributed training. We added the functionality of Model Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.  
-For detailed information about the adjustments take a look [here](https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html?highlight=modelparallel#required-sagemaker-python-sdk-parameters).
-
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
-
-
-```python
-# configuration for running training on smdistributed Model Parallel
-mpi_options = {
-    "enabled" : True,
-    "processes_per_host" : 8
-}
-
-smp_options = {
-    "enabled":True,
-    "parameters": {
-        "microbatches": 4,
-        "placement_strategy": "spread",
-        "pipeline": "interleaved",
-        "optimize": "speed",
-        "partitions": 4,
-        "ddp": True,
-    }
-}
-
-distribution={
-    "smdistributed": {"modelparallel": smp_options},
-    "mpi": mpi_options
-}
-
- # create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3dn.24xlarge',
-        instance_count=2,
-        role=role,
-        transformers_version='4.4.2',
-        pytorch_version='1.6.0',
-        py_version='py36',
-        hyperparameters = hyperparameters,
-        distribution = distribution
-)
-```
-
-### Spot Instances
-
-With the creation of HuggingFace Framework extension for the SageMaker Python SDK we can also leverage the benefit of [fully-managed EC2 spot instances](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html) and save up to 90% of our training cost.
-
-_Note: Unless your training job completes quickly, we recommend you use [checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html) with managed spot training, therefore you need to define the `checkpoint_s3_uri`._
-
-To use spot instances with the `HuggingFace` Estimator we have to set the `use_spot_instances` parameter to `True` and define your `max_wait` and `max_run` time. You can read more about the [managed spot training lifecycle here](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html).
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)
-
-```python
-# hyperparameters, which are passed into the training job
-hyperparameters={'epochs': 1,
-                 'train_batch_size': 32,
-                 'model_name':'distilbert-base-uncased',
-                 'output_dir':'/opt/ml/checkpoints'
-                 }
-# create the Estimator
-
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-	    checkpoint_s3_uri=f's3://{sess.default_bucket()}/checkpoints'
-        use_spot_instances=True,
-        max_wait=3600, # This should be equal to or greater than max_run in seconds'
-        max_run=1000,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters = hyperparameters
-)
-
-# Training seconds: 874
-# Billable seconds: 262
-# Managed Spot Training savings: 70.0%
-
-```
-
-### Git Repository
-
-When you create a `HuggingFace` Estimator, you can specify a [training script that is stored in a GitHub repository](https://sagemaker.readthedocs.io/en/stable/overview.html#use-scripts-stored-in-a-git-repository) as the entry point for the estimator, so that you don’t have to download the scripts locally. If Git support is enabled, the `entry_point` and `source_dir` should be relative paths in the Git repo if provided. 
-
-If you are using `git_config` to run the [🤗 Transformers examples scripts](https://github.com/huggingface/transformers/tree/master/examples) keep in mind that you need to configure the right `'branch'` for you `transformers_version`, e.g. if you use `transformers_version='4.4.2` you have to use `'branch':'v4.4.2'`. 
-
-As an example to use `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification).
-
-_Tip: define `output_dir` as `/opt/ml/model` in the hyperparameter for the script to save your model to S3 after training._
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)
-
-```python
-# configure git settings
-git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.4.2'} # v4.4.2 is referring to the `transformers_version you use in the estimator.
-
- # create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='run_glue.py',
-        source_dir='./examples/pytorch/text-classification',
-        git_config=git_config,
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters=hyperparameters
-)
-
-```
-
-### SageMaker Metrics
-
-[SageMaker Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html#define-train-metrics) can automatically parse the logs for metrics and send those metrics to CloudWatch. If you want SageMaker to parse logs you have to specify the metrics that you want SageMaker to send to CloudWatch when you configure the training job. You specify the name of the metrics that you want to send and the regular expressions that SageMaker uses to parse the logs that your algorithm emits to find those metrics.
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)
-
-```python
-# define metrics definitions
-
-metric_definitions = [
-{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
-{"Name": "eval_accuracy", "Regex": "eval_accuracy.*=\D*(.*?)$"},
-{"Name": "eval_loss", "Regex": "eval_loss.*=\D*(.*?)$"},
-]
-
-# create the Estimator
-
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        metric_definitions=metric_definitions,
-        hyperparameters = hyperparameters)
-
-```
-
-
-## Deep Learning Container (DLC) overview
-
-The Deep Learning Container are in every available where Amazon SageMaker is available. You can see the [AWS region table](https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/) for all AWS global infrastructure. To get an detailed overview of all included packages look [here in the release notes](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html).
-
-| 🤗 Transformers version | 🤗 Datasets version | PyTorch/TensorFlow version | type     | device | Python Version | Example `image_uri`                                                                                                               |
-| ----------------------- | ------------------- | -------------------------- | -------- | ------ | -------------- | --------------------------------------------------------------------------------------------------------------------------------- |
-| 4.4.2                   | 1.5.0               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04`    |
-| 4.4.2                   | 1.5.0               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04` |
-| 4.5.0                   | 1.5.0               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04`    |
-| 4.5.0                   | 1.5.0               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.5.0-gpu-py37-cu110-ubuntu18.04` |
-| 4.6.1                   | 1.6.2               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.5.0-gpu-py36-cu110-ubuntu18.04`    |
-| 4.6.1                   | 1.6.2               | PyTorch 1.7.1               | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.7.1-transformers4.6.1-gpu-py36-cu110-ubuntu18.04` |
-| 4.6.1                   | 1.6.2               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.6.1-gpu-py37-cu110-ubuntu18.04` |
-
---
-
-
-## Additional Resources
-
- [Announcement Blog Post](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face)
-
- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/)
-
- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
-
- [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)
+- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train)
+- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference)
+- [Frequently Asked Questions](https://huggingface.co/docs/sagemaker/faq)
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -21,11 +21,166 @@ Projects `ONNX (Open Neural Network eXchange) <http://onnx.ai>`_ and `ONNXRuntim
 unified and community-driven format to store and, by extension, efficiently execute neural network leveraging a variety
 of hardware and dedicated optimizations.

+
 Starting from transformers v2.10.0 we partnered with ONNX Runtime to provide an easy export of transformers models to
 the ONNX format. You can have a look at the effort by looking at our joint blog post `Accelerate your NLP pipelines
 using Hugging Face Transformers and ONNX Runtime
 <https://medium.com/microsoftazure/accelerate-your-nlp-pipelines-using-hugging-face-transformers-and-onnx-runtime-2443578f4333>`_.

+
+Configuration-based approach
+-----------------------------------------------------------------------------------------------------------------------
+
+Transformers v4.9.0 introduces a new package: ``transformers.onnx``. This package allows converting checkpoints to an
+ONNX graph by leveraging configuration objects. These configuration objects come ready made for a number of model
+architectures, and are made to be easily extendable to other architectures.
+
+Ready-made configurations include the following models:
+
+- ALBERT
+- BART
+- BERT
+- DistilBERT
+- GPT-2
+- LayoutLM
+- RoBERTa
+- T5
+- XLM-RoBERTa
+
+This conversion is handled with the PyTorch version of models - it, therefore, requires PyTorch to be installed. If you
+would like to be able to convert from TensorFlow, please let us know by opening an issue.
+
+.. note::
+    The models showcased here are close to fully feature complete, but do lack some features that are currently in
+    development. Namely, the ability to handle the past key values for decoder models is currently in the works.
+
+
+Converting a model to ONNX using the ``transformers.onnx`` package
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The package may be used as a Python module:
+
+.. code-block::
+
+    python -m transformers.onnx --help
+
+    usage: Hugging Face ONNX Exporter tool [-h] -m MODEL -f {pytorch} [--features {default}] [--opset OPSET] [--atol ATOL] output
+
+    positional arguments:
+      output                Path indicating where to store generated ONNX model.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -m MODEL, --model MODEL
+                            Model's name of path on disk to load.
+      --features {default}  Export the model with some additional features.
+      --opset OPSET         ONNX opset version to export the model with (default 12).
+      --atol ATOL           Absolute difference tolerance when validating the model.
+
+Exporting a checkpoint using a ready-made configuration can be done as follows:
+
+.. code-block::
+
+    python -m transformers.onnx --model=bert-base-cased onnx/bert-base-cased/
+
+This exports an ONNX graph of the mentioned checkpoint. Here it is `bert-base-cased`, but it can be any model from the
+hub, or a local path.
+
+It will be exported under ``onnx/bert-base-cased``. You should see similar logs:
+
+.. code-block::
+
+    Validating ONNX model...
+            -[✓] ONNX model outputs' name match reference model ({'pooler_output', 'last_hidden_state'}
+            - Validating ONNX Model output "last_hidden_state":
+                    -[✓] (2, 8, 768) matchs (2, 8, 768)
+                    -[✓] all values close (atol: 0.0001)
+            - Validating ONNX Model output "pooler_output":
+                    -[✓] (2, 768) matchs (2, 768)
+                    -[✓] all values close (atol: 0.0001)
+    All good, model saved at: onnx/bert-base-cased/model.onnx
+
+This export can now be used in the ONNX inference runtime:
+
+.. code-block::
+
+    import onnxruntime as ort
+
+    from transformers import BertTokenizerFast
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
+
+    ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
+
+    inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")
+    outputs = ort_session.run(["last_hidden_state", "pooler_output"], dict(inputs))
+
+The outputs used (:obj:`["last_hidden_state", "pooler_output"]`) can be obtained by taking a look at the ONNX
+configuration of each model. For example, for BERT:
+
+.. code-block::
+
+    from transformers.models.bert import BertOnnxConfig, BertConfig
+
+    config = BertConfig()
+    onnx_config = BertOnnxConfig(config)
+    output_keys = list(onnx_config.outputs.keys())
+
+Implementing a custom configuration for an unsupported architecture
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Let's take a look at the changes necessary to add a custom configuration for an unsupported architecture. Firstly, we
+will need a custom ONNX configuration object that details the model inputs and outputs. The BERT ONNX configuration is
+visible below:
+
+.. code-block::
+
+    class BertOnnxConfig(OnnxConfig):
+        @property
+        def inputs(self) -> Mapping[str, Mapping[int, str]]:
+            return OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "sequence"}),
+                    ("attention_mask", {0: "batch", 1: "sequence"}),
+                    ("token_type_ids", {0: "batch", 1: "sequence"}),
+                ]
+            )
+
+        @property
+        def outputs(self) -> Mapping[str, Mapping[int, str]]:
+            return OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"}), ("pooler_output", {0: "batch"})])
+
+Let's understand what's happening here. This configuration has two properties: the inputs, and the outputs.
+
+The inputs return a dictionary, where each key corresponds to an expected input, and each value indicates the axes of
+that input.
+
+For BERT, there are three necessary inputs. These three inputs are of similar shape, which is made up of two
+dimensions: the batch is the first dimension, and the second is the sequence.
+
+The outputs return a similar dictionary, where, once again, each key corresponds to an expected output, and each value
+indicates the axes of that output.
+
+Once this is done, a single step remains: adding this configuration object to the initialisation of the model class,
+and to the general ``transformers`` initialisation.
+
+An important fact to notice is the use of `OrderedDict` in both inputs and outputs properties. This is a requirement
+as inputs are matched against their relative position within the `PreTrainedModel.forward()` prototype and outputs are
+matched against their position in the returned `BaseModelOutputX` instance.
+
+An example of such an addition is visible here, for the MBart model: `Making MBART ONNX-convertible
+<https://github.com/huggingface/transformers/pull/13049/commits/d097adcebd89a520f04352eb215a85916934204f>`__
+
+If you would like to contribute your addition to the library, we recommend you implement tests. An example of such
+tests is visible here: `Adding tests to the MBART ONNX conversion
+<https://github.com/huggingface/transformers/pull/13049/commits/5d642f65abf45ceeb72bd855ca7bfe2506a58e6a>`__
+
+Graph conversion
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+    The approach detailed here is being deprecated. We recommend you follow the part above for an up to date approach.
+
+
 Exporting a model is done through the script `convert_graph_to_onnx.py` at the root of the transformers sources. The
 following command shows how easy it is to export a BERT model from the library, simply run:

--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -107,7 +107,8 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

-    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
+    >>> # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
+    >>> # the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

@@ -141,12 +142,13 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

-    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
+    >>> # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
+    >>> # the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")

-    >>> paraphrase_classification_logits = model(paraphrase)[0]
-    >>> not_paraphrase_classification_logits = model(not_paraphrase)[0]
+    >>> paraphrase_classification_logits = model(paraphrase).logits
+    >>> not_paraphrase_classification_logits = model(not_paraphrase).logits

    >>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
    >>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
@@ -197,11 +199,11 @@ positions of the extracted answer in the text.

    >>> result = question_answerer(question="What is extractive question answering?", context=context)
    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-    Answer: 'the task of extracting an answer from a text given a question.', score: 0.6226, start: 34, end: 96
+    Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95

    >>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-    Answer: 'SQuAD dataset,', score: 0.5053, start: 147, end: 161
+    Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160


 Here is an example of question answering using a model and a tokenizer. The process is the following:
@@ -247,10 +249,10 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
    ...
-    ...     answer_start = torch.argmax(
-    ...         answer_start_scores
-    ...     )  # Get the most likely beginning of answer with the argmax of the score
-    ...     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+    ...     # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_start = torch.argmax(answer_start_scores)
+    ...     # Get the most likely end of answer with the argmax of the score
+    ...     answer_end = torch.argmax(answer_end_scores) + 1
    ...
    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    ...
@@ -261,7 +263,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    Question: What does 🤗 Transformers provide?
    Answer: general - purpose architectures
    Question: 🤗 Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
+    Answer: tensorflow 2. 0 and pytorch
    >>> ## TENSORFLOW CODE
    >>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
    >>> import tensorflow as tf
@@ -290,12 +292,11 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
    ...
-    ...     answer_start = tf.argmax(
-    ...         answer_start_scores, axis=1
-    ...     ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
-    ...     answer_end = (
-    ...         tf.argmax(answer_end_scores, axis=1) + 1
-    ...     ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+    ...     # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
+    ...     # Get the most likely end of answer with the argmax of the score
+    ...     answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1
+    ...
    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    ...
    ...     print(f"Question: {question}")
@@ -305,7 +306,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    Question: What does 🤗 Transformers provide?
    Answer: general - purpose architectures
    Question: 🤗 Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
+    Answer: tensorflow 2. 0 and pytorch



@@ -344,31 +345,31 @@ This outputs the sequences with the mask filled, the confidence score, and the t

    >>> from pprint import pprint
    >>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))
-    [{'score': 0.1792745739221573,
-      'sequence': '<s>HuggingFace is creating a tool that the community uses to '
-                  'solve NLP tasks.</s>',
+    [{'score': 0.1793,
+      'sequence': 'HuggingFace is creating a tool that the community uses to solve '
+                  'NLP tasks.',
      'token': 3944,
-      'token_str': 'Ġtool'},
-     {'score': 0.11349421739578247,
-      'sequence': '<s>HuggingFace is creating a framework that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' tool'},
+     {'score': 0.1135,
+      'sequence': 'HuggingFace is creating a framework that the community uses to '
+                  'solve NLP tasks.',
      'token': 7208,
-      'token_str': 'Ġframework'},
-     {'score': 0.05243554711341858,
-      'sequence': '<s>HuggingFace is creating a library that the community uses to '
-                  'solve NLP tasks.</s>',
+      'token_str': ' framework'},
+     {'score': 0.0524,
+      'sequence': 'HuggingFace is creating a library that the community uses to '
+                  'solve NLP tasks.',
      'token': 5560,
-      'token_str': 'Ġlibrary'},
-     {'score': 0.03493533283472061,
-      'sequence': '<s>HuggingFace is creating a database that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' library'},
+     {'score': 0.0349,
+      'sequence': 'HuggingFace is creating a database that the community uses to '
+                  'solve NLP tasks.',
      'token': 8503,
-      'token_str': 'Ġdatabase'},
-     {'score': 0.02860250137746334,
-      'sequence': '<s>HuggingFace is creating a prototype that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' database'},
+     {'score': 0.0286,
+      'sequence': 'HuggingFace is creating a prototype that the community uses to '
+                  'solve NLP tasks.',
      'token': 17715,
-      'token_str': 'Ġprototype'}]
+      'token_str': ' prototype'}]

 Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following:

@@ -385,42 +386,22 @@ Here is an example of doing masked language modeling using a model and a tokeniz
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForMaskedLM, AutoTokenizer
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+    >>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")

-    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+    >>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+    ...     f"versions would help {tokenizer.mask_token} our carbon footprint."

-    >>> input = tokenizer.encode(sequence, return_tensors="pt")
-    >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

-    >>> token_logits = model(input).logits
+    >>> token_logits = model(**inputs).logits
    >>> mask_token_logits = token_logits[0, mask_token_index, :]

    >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    >>> import tensorflow as tf
-
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    >>> input = tokenizer.encode(sequence, return_tensors="tf")
-    >>> mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
-
-    >>> token_logits = model(input)[0]
-    >>> mask_token_logits = token_logits[0, mask_token_index, :]
-
-    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
-
-
-This prints five sequences, with the top 5 tokens predicted by the model:
-
-.. code-block::

    >>> for token in top_5_tokens:
    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
@@ -429,6 +410,34 @@ This prints five sequences, with the top 5 tokens predicted by the model:
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForMaskedLM, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+    ...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
+
+    >>> token_logits = model(**inputs).logits
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+    >>> for token in top_5_tokens:
+    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+This prints five sequences, with the top 5 tokens predicted by the model.


 Causal Language Modeling
@@ -449,19 +458,20 @@ of tokens.
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
    >>> import torch
    >>> from torch import nn

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

-    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> input_ids = inputs["input_ids"]

    >>> # get logits of last hidden state
-    >>> next_token_logits = model(input_ids).logits[:, -1, :]
+    >>> next_token_logits = model(**inputs).logits[:, -1, :]

    >>> # filter
    >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
@@ -473,19 +483,22 @@ of tokens.
    >>> generated = torch.cat([input_ids, next_token], dim=-1)

    >>> resulting_string = tokenizer.decode(generated.tolist()[0])
+    >>> print(resulting_string)
+    Hugging Face is based in DUMBO, New York City, and ...
    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+    >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")

-    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

-    >>> input_ids = tokenizer.encode(sequence, return_tensors="tf")
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> input_ids = inputs["input_ids"]

    >>> # get logits of last hidden state
-    >>> next_token_logits = model(input_ids)[0][:, -1, :]
+    >>> next_token_logits = model(**inputs).logits[:, -1, :]

    >>> # filter
    >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
@@ -496,14 +509,11 @@ of tokens.
    >>> generated = tf.concat([input_ids, next_token], axis=1)

    >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
-
-
-This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
-
-.. code-block::
-
    >>> print(resulting_string)
-    Hugging Face is based in DUMBO, New York City, and has
+    Hugging Face is based in DUMBO, New York City, and ...
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *is* or
+*features*.

 In the next section, we show how :func:`~transformers.generation_utils.GenerationMixin.generate` can be used to
 generate multiple tokens up to a specified length instead of one token at a time.
@@ -522,7 +532,8 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config

    >>> text_generator = pipeline("text-generation")
    >>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
-    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]
+    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a
+    "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]



@@ -536,9 +547,9 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
@@ -554,41 +565,42 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which
    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""

    >>> prompt = "Today the weather is really nice and I am planning on "
-    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+    >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

-    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> prompt_length = len(tokenizer.decode(inputs[0]))
    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
-
-    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    ... (except for Alexei and Maria) are discovered.
-    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    ... remainder of the story. 1883 Western Siberia,
-    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    ... father initially slaps him for making such an accusation, Rasputin watches as the
-    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
-
-    >>> prompt = "Today the weather is really nice and I am planning on "
-    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
-
-    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-.. code-block::
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]

    >>> print(generated)
-    Today the weather is really nice and I am planning on anning on taking a nice...... of a great time!<eop>...............
+    Today the weather is really nice and I am planning ...
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer
+
+    >>> model = TFAutoModelForCausalLM.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")["input_ids"]
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0]))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+
+    >>> print(generated)
+    Today the weather is really nice and I am planning ...
+

 Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in
 PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-XL* often
@@ -638,21 +650,20 @@ Here are the expected results:

 .. code-block::

-    >>> print(ner_pipe(sequence))
-    [
-        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
-        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
-        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
-        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
-        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
-        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
-        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
-        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
-        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
-        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
-        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
-        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
-    ]
+    >>> for entity in ner_pipe(sequence):
+    ...     print(entity)
+    {'entity': 'I-ORG', 'score': 0.9996, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}
+    {'entity': 'I-ORG', 'score': 0.9910, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}
+    {'entity': 'I-ORG', 'score': 0.9982, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}
+    {'entity': 'I-ORG', 'score': 0.9995, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16}
+    {'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'New', 'start': 40, 'end': 43}
+    {'entity': 'I-LOC', 'score': 0.9993, 'index': 12, 'word': 'York', 'start': 44, 'end': 48}
+    {'entity': 'I-LOC', 'score': 0.9994, 'index': 13, 'word': 'City', 'start': 49, 'end': 53}
+    {'entity': 'I-LOC', 'score': 0.9863, 'index': 19, 'word': 'D', 'start': 79, 'end': 80}
+    {'entity': 'I-LOC', 'score': 0.9514, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82}
+    {'entity': 'I-LOC', 'score': 0.9337, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84}
+    {'entity': 'I-LOC', 'score': 0.9762, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123}
+    {'entity': 'I-LOC', 'score': 0.9915, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130}

 Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City",
 "DUMBO" and "Manhattan Bridge" have been identified as locations.
@@ -679,26 +690,13 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> label_list = [
-    ...     "O",       # Outside of a named entity
-    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-    ...     "I-MISC",  # Miscellaneous entity
-    ...     "B-PER",   # Beginning of a person's name right after another person's name
-    ...     "I-PER",   # Person's name
-    ...     "B-ORG",   # Beginning of an organisation right after another organisation
-    ...     "I-ORG",   # Organisation
-    ...     "B-LOC",   # Beginning of a location right after another location
-    ...     "I-LOC"    # Location
-    ... ]
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
+    ...            "therefore very close to the Manhattan Bridge."

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-    ...            "close to the Manhattan Bridge."
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> tokens = inputs.tokens()

-    >>> # Bit of a hack to get the tokens with the special tokens
-    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    >>> inputs = tokenizer.encode(sequence, return_tensors="pt")
-
-    >>> outputs = model(inputs).logits
+    >>> outputs = model(**inputs).logits
    >>> predictions = torch.argmax(outputs, dim=2)
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
@@ -707,14 +705,13 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-    ...            "close to the Manhattan Bridge."
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
+    ...            "therefore very close to the Manhattan Bridge."

-    >>> # Bit of a hack to get the tokens with the special tokens
-    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    >>> inputs = tokenizer.encode(sequence, return_tensors="tf")
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> tokens = inputs.tokens()

-    >>> outputs = model(inputs)[0]
+    >>> outputs = model(**inputs)[0]
    >>> predictions = tf.argmax(outputs, axis=2)


@@ -755,8 +752,7 @@ illustrated below:
    (',', 'O')
    ('therefore', 'O')
    ('very', 'O')
-    ('##c', 'O')
-    ('##lose', 'O')
+    ('close', 'O')
    ('to', 'O')
    ('the', 'O')
    ('Manhattan', 'I-LOC')
@@ -764,6 +760,7 @@ illustrated below:
    ('.', 'O')
    ('[SEP]', 'O')

+
 Summarization
 -----------------------------------------------------------------------------------------------------------------------

@@ -811,7 +808,9 @@ below. This outputs the following summary:
 .. code-block::

    >>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
-    [{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
+    [{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in
+    the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and
+    2002 . At one time, she was married to eight men at once, prosecutors say .'}]

 Here is an example of doing summarization using a model and a tokenizer. The process is the following:

@@ -833,8 +832,15 @@ CNN / Daily Mail), it yields very good results.
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
-    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    >>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
+    >>> outputs = model.generate(
+    ...     inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
+    ... )
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal
+    counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them
+    between 1999 and 2002.</s>
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

@@ -842,13 +848,15 @@ CNN / Daily Mail), it yields very good results.
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
-    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-
-.. code-block::
+    >>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    >>> outputs = model.generate(
+    ...     inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
+    ... )

    >>> print(tokenizer.decode(outputs[0]))
-    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them between 1999 and 2002.</s>
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal
+    counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them
+    between 1999 and 2002.


 Translation
@@ -888,25 +896,32 @@ Here is an example of doing translation using a model and a tokenizer. The proce
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

-    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
-    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
-    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-
-As with the pipeline example, we get the same translation:
-
-.. code-block::
+    >>> inputs = tokenizer(
+    ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
+    ...     return_tensors="pt"
+    ... )
+    >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

    >>> print(tokenizer.decode(outputs[0]))
-    Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+    <pad> Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.</s>
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
+
+    >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer(
+    ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
+    ...     return_tensors="tf"
+    ... )
+    >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+
+We get the same translation as with the pipeline example.
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -431,6 +431,7 @@ decorators are used to set the requirements of tests CPU/GPU/TPU-wise:
 * ``require_torch_gpu`` - as ``require_torch`` plus requires at least 1 GPU
 * ``require_torch_multi_gpu`` - as ``require_torch`` plus requires at least 2 GPUs
 * ``require_torch_non_multi_gpu`` - as ``require_torch`` plus requires 0 or 1 GPUs
+* ``require_torch_up_to_2_gpus`` - as ``require_torch`` plus requires 0 or 1 or 2 GPUs
 * ``require_torch_tpu`` - as ``require_torch`` plus requires at least 1 TPU

 Let's depict the GPU requirements in the following table:
@@ -447,6 +448,8 @@ Let's depict the GPU requirements in the following table:
 +----------+----------------------------------+
 | ``< 2``  | ``@require_torch_non_multi_gpu`` |
 +----------+----------------------------------+
+| ``< 3``  | ``@require_torch_up_to_2_gpus``  |
+----------+----------------------------------+


 For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed:
@@ -1167,6 +1170,23 @@ To start a debugger at the point of the warning, do this:
    pytest tests/test_logging.py -W error::UserWarning --pdb


+Working with GitHub Actions workflows
+-----------------------------------------------------------------------------------------------------------------------
+
+To trigger a self-push workflow CI job, you must:
+
+1. Create a new branch on ``transformers`` origin (not a fork!).
+2. The branch name has to start with either ``ci_`` or ``ci-`` (``master`` triggers it too, but we can't do PRs on
+   ``master``). It also gets triggered only for specific paths - you can find the up-to-date definition in case it
+   changed since this document has been written `here
+   <https://github.com/huggingface/transformers/blob/master/.github/workflows/self-push.yml>`__ under ``push:``.
+3. Create a PR from this branch.
+4. Then you can see the job appear `here
+   <https://github.com/huggingface/transformers/actions/workflows/self-push.yml>`__. It may not run right away if there
+   is a backlog.
+
+
+

 Testing Experimental CI Features
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -33,7 +33,7 @@ Preparing the datasets
   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
   picture-in-picture" allowfullscreen></iframe>

-We will use the `🤗 Datasets <https:/github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
+We will use the `🤗 Datasets <https://github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
 datasets. We will go over this part pretty quickly. Since the focus of this tutorial is on training, you should refer
 to the 🤗 Datasets `documentation <https://huggingface.co/docs/datasets/>`__ or the :doc:`preprocessing` tutorial for
 more information.
@@ -152,7 +152,7 @@ To fine-tune our model, we just need to call
    trainer.train()

 which will start a training that you can follow with a progress bar, which should take a couple of minutes to complete
-(as long as you hav access to a GPU). It won't actually tell you anything useful about how well (or badly) your model
+(as long as you have access to a GPU). It won't actually tell you anything useful about how well (or badly) your model
 is performing however as by default, there is no evaluation during training, and we didn't tell the
 :class:`~transformers.Trainer` to compute any metrics. Let's have a look on how to do that now!

@@ -281,7 +281,7 @@ Fine-tuning in native PyTorch
   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
   picture-in-picture" allowfullscreen></iframe>

-You might need to restart your notebook at this stage to free some memory, or excute the following code:
+You might need to restart your notebook at this stage to free some memory, or execute the following code:

 .. code-block:: python

--- a/examples/flax/README.md
+++ b/examples/flax/README.md
@@ -19,6 +19,17 @@ This folder contains actively maintained examples of 🤗 Transformers using the

 *NOTE*: Currently, there is no "Trainer" abstraction for JAX/Flax -- all examples contain an explicit training loop.

+The following table lists all of our examples on how to use 🤗 Transformers with the JAX/Flax backend:
+- with information about the model and dataset used,
+- whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library,
+- links to **Colab notebooks** to walk through the scripts and run them easily.
+
+| Task | Example model | Example dataset | 🤗 Datasets | Colab
+|---|---|---|:---:|:---:|
+| [**`causal-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | GPT2 | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb)
+| [**`masked-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | RoBERTa | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb)
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/flax/text-classification) | BERT | GLUE | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification_flax.ipynb)
+
 ## Intro: JAX and Flax

 [JAX](https://github.com/google/jax) is a numerical computation library that exposes a NumPy-like API with tracing capabilities. With JAX's `jit`, you can
@@ -35,6 +46,8 @@ module abstraction using Python dataclasses that leads to concise and explicit c
 All of our JAX/Flax models are designed to run efficiently on Google
 Cloud TPUs. Here is [a guide for running JAX on Google Cloud TPU](https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm).

+Consider applying for the [Google TPU Research Cloud project](https://sites.research.google/trc/) for free TPU compute.
+
 Each example README contains more details on the specific model and training
 procedure.

@@ -47,17 +60,4 @@ be adding a guide for porting models from PyTorch in the upcoming few weeks.
 For a complete overview of models that are supported in JAX/Flax, please have a look at [this](https://huggingface.co/transformers/master/index.html#supported-frameworks) table.

 Over 3000 pretrained checkpoints are supported in JAX/Flax as of May 2021.
-Click [here](https://huggingface.co/models?filter=jax) to see the full list on the 🤗 hub. 
-
-## Examples
-
-The following table lists all of our examples on how to use 🤗 Transformers with the JAX/Flax backend:
- with information about the model and dataset used,
- whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library,
- links to **Colab notebooks** to walk through the scripts and run them easily.
-
-| Task | Example model | Example dataset | 🤗 Datasets | Colab
-|---|---|---|:---:|:---:|
-| [**`causal-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | GPT2 | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb)
-| [**`masked-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | RoBERTa | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb)
-| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/flax/text-classification) | BERT | GLUE | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification_flax.ipynb)
+Click [here](https://huggingface.co/models?filter=jax) to see the full list on the 🤗 hub.
--- a/examples/flax/language-modeling/README.md
+++ b/examples/flax/language-modeling/README.md
@@ -33,11 +33,31 @@ in Norwegian on a single TPUv3-8 pod.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

-Let's start by creating a folder to save the trained model and a symbolic link to the `run_mlm_flax.py` script.
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"norwegian-roberta-base"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create norwegian-roberta-base
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/norwegian-roberta-base
+```
+
+To set up all relevant files for training, let's go into the cloned model directory.
+
+```bash
+cd norwegian-roberta-base
+```
+
+Next, let's add a symbolic link to the `run_mlm_flax.py` script.

 ```bash
-export MODEL_DIR="./norwegian-roberta-base"
-mkdir -p ${MODEL_DIR}
 ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_flax.py
 ```

@@ -45,15 +65,13 @@ ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_fla

 In the first step, we train a tokenizer to efficiently process the text input for the model. Similar to how it is shown in [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train), we use a **`ByteLevelBPETokenizer`**.
 The tokenizer is trained on the complete Norwegian dataset of OSCAR
-and consequently saved in `${MODEL_DIR}`
+and consequently saved in the cloned model directory.
 This can take up to 10 minutes depending on your hardware ☕.

 ```python
 from datasets import load_dataset
 from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer

-model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}
-
 # load dataset
 dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")

@@ -74,7 +92,7 @@ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=
 ])

 # Save files to disk
-tokenizer.save(f"{model_dir}/tokenizer.json")
+tokenizer.save("./tokenizer.json")
 ```

 ### Create configuration
@@ -86,22 +104,23 @@ in the local model folder:
 ```python
 from transformers import RobertaConfig

-model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}
-
-config = RobertaConfig.from_pretrained("roberta-base")
-config.save_pretrained(model_dir)
+config = RobertaConfig.from_pretrained("roberta-base", vocab_size=50265)
+config.save_pretrained("./")
 ```

+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
 ### Train model

 Next we can run the example script to pretrain the model:

 ```bash
 ./run_mlm_flax.py \
-    --output_dir="./runs" \
+    --output_dir="./" \
    --model_type="roberta" \
-    --config_name="${MODEL_DIR}" \
-    --tokenizer_name="${MODEL_DIR}" \
+    --config_name="./" \
+    --tokenizer_name="./" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_no" \
    --max_seq_length="128" \
@@ -111,16 +130,19 @@ Next we can run the example script to pretrain the model:
    --learning_rate="3e-4" \
    --warmup_steps="1000" \
    --overwrite_output_dir \
-    --pad_to_max_length \
    --num_train_epochs="18" \
    --adam_beta1="0.9" \
-    --adam_beta2="0.98"
+    --adam_beta2="0.98" \
+    --logging_steps="500" \
+    --save_steps="2500" \
+    --eval_steps="2500" \
+    --push_to_hub
 ```

 Training should converge at a loss and accuracy 
 of 1.78 and 0.64 respectively after 18 epochs on a single TPUv3-8.
 This should take less than 18 hours.
-Training statistics can be accessed on [tfhub.de](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg).
+Training statistics can be accessed on [tfhub.dev](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg).

 For a step-by-step walkthrough of how to do masked language modeling in Flax, please have a 
 look at [this](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb) google colab.
@@ -135,15 +157,67 @@ in Norwegian on a single TPUv3-8 pod.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

-Let's start by creating a folder to save the trained model and a symbolic link to the `run_clm_flax.py` script.
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"norwegian-gpt2"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create norwegian-gpt2
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/norwegian-gpt2
+```
+
+To set up all relevant files for training, let's go into the cloned model directory.
+
+```bash
+cd norwegian-gpt2
+```
+
+Next, let's add a symbolic link to the training script `run_clm_flax.py`.

 ```bash
-export MODEL_DIR="./norwegian-gpt2"
-mkdir -p ${MODEL_DIR}
 ln -s ~/transformers/examples/flax/language-modeling/run_clm_flax.py run_clm_flax.py
 ```

-Next, we'll follow the same steps as above in [Train tokenizer](#train-tokenizer) to train the tokenizer.
+### Train tokenizer
+
+In the first step, we train a tokenizer to efficiently process the text input for the model. Similar to how it is shown in [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train), we use a **`ByteLevelBPETokenizer`**.
+The tokenizer is trained on the complete Norwegian dataset of OSCAR
+and consequently saved in the cloned model directory.
+This can take up to 10 minutes depending on your hardware ☕.
+
+```python
+from datasets import load_dataset
+from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
+
+# load dataset
+dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")
+
+# Instantiate tokenizer
+tokenizer = ByteLevelBPETokenizer()
+
+def batch_iterator(batch_size=1000):
+    for i in range(0, len(dataset), batch_size):
+        yield dataset[i: i + batch_size]["text"]
+
+# Customized training
+tokenizer.train_from_iterator(batch_iterator(), vocab_size=50257, min_frequency=2, special_tokens=[
+    "<s>",
+    "<pad>",
+    "</s>",
+    "<unk>",
+    "<mask>",
+])
+
+# Save files to disk
+tokenizer.save("./tokenizer.json")
+```

 ### Create configuration

@@ -154,22 +228,23 @@ in the local model folder:
 ```python
 from transformers import GPT2Config

-model_dir = "./norwegian-gpt2"  # ${MODEL_DIR}
-
-config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0)
-config.save_pretrained(model_dir)
+config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=50257)
+config.save_pretrained("./")
 ```

+Great, we have set up our model repository. During training, we will now automatically
+push the training logs and model weights to the repo.
+
 ### Train model

-Next we can run the example script to pretrain the model:
+Finally, we can run the example script to pretrain the model:

 ```bash
 ./run_clm_flax.py \
-    --output_dir="./runs" \
+    --output_dir="./" \
    --model_type="gpt2" \
-    --config_name="${MODEL_DIR}" \
-    --tokenizer_name="${MODEL_DIR}" \
+    --config_name="./" \
+    --tokenizer_name="./" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_no" \
    --do_train --do_eval \
@@ -180,6 +255,10 @@ Next we can run the example script to pretrain the model:
    --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
    --overwrite_output_dir \
    --num_train_epochs="20" \
+    --logging_steps="500" \
+    --save_steps="2500" \
+    --eval_steps="2500" \
+    --push_to_hub
 ```

 Training should converge at a loss and perplexity 
@@ -187,6 +266,140 @@ of 3.24 and 25.72 respectively after 20 epochs on a single TPUv3-8.
 This should take less than ~21 hours.
 Training statistics can be accessed on [tfhub.dev](https://tensorboard.dev/experiment/2zEhLwJ0Qp2FAkI3WVH9qA).

+For a step-by-step walkthrough of how to do causal language modeling in Flax, please have a 
+look at [this](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb) google colab.
+
+## T5-like span-masked language modeling
+
+In the following, we demonstrate how to train a T5 model using the span-masked language model 
+objective as proposed in [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683).
+More specifically, we demonstrate how JAX/Flax can be leveraged 
+to pre-train [**`google/t5-v1_1-base`**](https://huggingface.co/google/t5-v1_1-base)
+in Norwegian on a single TPUv3-8 pod.
+
+The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
+
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"norwegian-t5-base"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create norwegian-t5-base
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/norwegian-t5-base
+```
+
+To set up all relevant files for training, let's go into the cloned model directory.
+
+```bash
+cd norwegian-t5-base
+```
+
+Next, let's add symbolic links to the `run_t5_mlm_flax.py` and `t5_tokenizer_model.py` scripts.
+
+```bash
+ln -s ~/transformers/examples/flax/language-modeling/run_t5_mlm_flax.py run_t5_mlm_flax.py
+ln -s ~/transformers/examples/flax/language-modeling/t5_tokenizer_model.py t5_tokenizer_model.py
+```
+
+### Train tokenizer
+
+In the first step, we train a tokenizer to efficiently process the text input for the model. 
+We make use of the [tokenizers](https://github.com/huggingface/tokenizers) library to train 
+a sentencepiece unigram tokenizer as shown in [t5_tokenizer_model.py](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling/t5_tokenizer_model.py) 
+which is heavily inspired by [yandex-research/DeDLOC's tokenizer model](https://github.com/yandex-research/DeDLOC/blob/5c994bc64e573702a9a79add3ecd68b38f14b548/sahajbert/tokenizer/tokenizer_model.py).
+
+The tokenizer is trained on the complete Norwegian dataset of OSCAR
+and consequently saved in the cloned model directory.
+This can take up to 120 minutes depending on your hardware ☕☕☕ .
+
+```python
+import datasets
+
+from t5_tokenizer_model import SentencePieceUnigramTokenizer
+
+
+vocab_size = 32_000
+input_sentence_size = None
+
+# Initialize a dataset
+dataset = datasets.load_dataset("oscar", name="unshuffled_deduplicated_no", split="train")
+
+tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
+
+
+# Build an iterator over this dataset
+def batch_iterator(input_sentence_size=None):
+    if input_sentence_size is None:
+        input_sentence_size = len(dataset)
+    batch_length = 100
+    for i in range(0, input_sentence_size, batch_length):
+        yield dataset[i: i + batch_length]["text"]
+
+
+# Train tokenizer
+tokenizer.train_from_iterator(
+    iterator=batch_iterator(input_sentence_size=input_sentence_size),
+    vocab_size=vocab_size,
+    show_progress=True,
+)
+
+# Save files to disk
+tokenizer.save("./tokenizer.json")
+```
+
+### Create configuration
+
+Next, we create the model's configuration file. This is as simple 
+as loading and storing [**`google/t5-v1_1-base`**](https://huggingface.co/google/t5-v1_1-base)
+in the local model folder:
+
+```python
+from transformers import T5Config
+
+config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.get_vocab_size())
+config.save_pretrained("./")
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+### Train model
+
+Next we can run the example script to pretrain the model:
+
+```bash
+./run_t5_mlm_flax.py \
+	--output_dir="./" \
+	--model_type="t5" \
+	--config_name="./" \
+	--tokenizer_name="./" \
+	--dataset_name="oscar" \
+	--dataset_config_name="unshuffled_deduplicated_no" \
+	--max_seq_length="512" \
+	--per_device_train_batch_size="32" \
+	--per_device_eval_batch_size="32" \
+	--adafactor \
+	--learning_rate="0.005" \
+	--weight_decay="0.001" \
+	--warmup_steps="2000" \
+	--overwrite_output_dir \
+	--logging_steps="500" \
+	--save_steps="10000" \
+	--eval_steps="2500" \
+	--push_to_hub
+```
+
+Training should converge at a loss and accuracy 
+of 2.36 and 57.0 respectively after 3 epochs on a single TPUv3-8.
+This should take around 4.5 hours.
+Training statistics can be accessed directly on the 🤗 [hub](https://huggingface.co/patrickvonplaten/t5-base-norwegian/tensorboard).

 ## Runtime evaluation

@@ -197,14 +410,9 @@ For reproducibility, we state the training commands used for PyTorch/XLA and PyT
 | Task  | [TPU v3-8 (Flax)](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg/)  | [TPU v3-8 (Pytorch/XLA)](https://tensorboard.dev/experiment/7Jq1kcQQRAmy12KOdXek7A/)| [8 GPU (PyTorch)](https://tensorboard.dev/experiment/PJneV8FQRxa2unPw1QnVHA)  |
 |-------|-----------|------------|------------|
 | MLM   |  15h32m   |  23h46m    | 44h14m     |
-| **COST*** | $124.24  | $187.84 | $877.92 |

-*All experiments are ran on Google Cloud Platform. Prices are on-demand prices
-(not preemptible), obtained on May 12, 2021 for zone Iowa (us-central1) using
-the following tables:
-[TPU pricing table](https://cloud.google.com/tpu/pricing) ($8.00/h for v3-8),
-[GPU pricing table](https://cloud.google.com/compute/gpus-pricing) ($2.48/h per
-V100 GPU). GPU experiments are ran without further optimizations besides JAX
+*All experiments were run on Google Cloud Platform.
+GPU experiments were run without further optimizations besides JAX
 transformations. GPU experiments were run with full precision (fp32). "TPU v3-8"
 are 8 TPU cores on 4 chips (each chip has 2 cores), while "8 GPU" are 8 GPU chips.

@@ -281,7 +489,7 @@ mkdir -p ${MODEL_DIR}

 ```bash
 python3 -m torch.distributed.launch --nproc_per_node ${NUM_GPUS} run_mlm.py \
-    --output_dir="./runs" \
+    --output_dir="${MODEL_DIR}" \
    --model_type="roberta" \
    --config_name="${MODEL_DIR}" \
    --tokenizer_name="${MODEL_DIR}" \
--- a/examples/flax/language-modeling/requirements.txt
+++ b/examples/flax/language-modeling/requirements.txt
@@ -2,4 +2,4 @@ datasets >= 1.1.3
 jax>=0.2.8
 jaxlib>=0.1.59
 flax>=0.3.4
-optax>=0.0.8
+optax>=0.0.9
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -31,6 +31,7 @@ from pathlib import Path
 from typing import Callable, Optional

 import datasets
+import numpy as np
 from datasets import Dataset, load_dataset
 from tqdm import tqdm

@@ -51,28 +52,13 @@ from transformers import (
    HfArgumentParser,
    TrainingArguments,
    is_tensorboard_available,
+    set_seed,
 )
 from transformers.testing_utils import CaptureLogger


 logger = logging.getLogger(__name__)

-# Cache the result
-has_tensorboard = is_tensorboard_available()
-if has_tensorboard:
-    try:
-        from flax.metrics.tensorboard import SummaryWriter
-    except ImportError as ie:
-        has_tensorboard = False
-        print(f"Unable to display metrics through TensorBoard because some package are not installed: {ie}")
-
-else:
-    print(
-        "Unable to display metrics through TensorBoard because the package is not installed: "
-        "Please run pip install tensorboard to enable."
-    )
-
-
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

@@ -170,6 +156,9 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -198,23 +187,21 @@ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuf
    steps_per_epoch = len(dataset) // batch_size

    if shuffle:
-        batch_idx = jax.random.permutation(rng, len(dataset))
+        batch_idx = np.random.permutation(len(dataset))
    else:
-        batch_idx = jnp.arange(len(dataset))
+        batch_idx = np.arange(len(dataset))

    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))

    for idx in batch_idx:
        batch = dataset[idx]
-        batch = {k: jnp.array(v) for k, v in batch.items()}
-
-        batch = shard(batch)
+        batch = {k: np.array(v) for k, v in batch.items()}

        yield batch


-def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
+def write_train_metric(summary_writer, train_metrics, train_time, step):
    summary_writer.scalar("train_time", train_time, step)

    train_metrics = get_metrics(train_metrics)
@@ -223,6 +210,8 @@ def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
        for i, val in enumerate(vals):
            summary_writer.scalar(tag, val, step - len(vals) + i + 1)

+
+def write_eval_metric(summary_writer, eval_metrics, step):
    for metric_name, value in eval_metrics.items():
        summary_writer.scalar(f"eval_{metric_name}", value, step)

@@ -267,7 +256,7 @@ def main():

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -283,6 +272,9 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
    #  Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -313,6 +305,7 @@ def main():
            )
    else:
        data_files = {}
+        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
@@ -320,7 +313,24 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)
+
+        if "validation" not in dataset.keys():
+            dataset["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                **dataset_args,
+            )
+            dataset["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                **dataset_args,
+            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -412,7 +422,8 @@ def main():
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
-        total_length = (total_length // block_size) * block_size
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
@@ -450,8 +461,22 @@ def main():
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
-        summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix())
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
@@ -477,23 +502,36 @@ def main():
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxGPT2.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
-        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        flat_mask = {
+            path: (path[-1] != "bias" and path[-2:] not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")])
+            for path in flat_params
+        }
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
-    adamw = optax.adamw(
-        learning_rate=linear_decay_lr_schedule_fn,
-        b1=training_args.adam_beta1,
-        b2=training_args.adam_beta2,
-        eps=training_args.adam_epsilon,
-        weight_decay=training_args.weight_decay,
-        mask=decay_mask_fn,
-    )
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )

    # Setup train state
-    state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
+    state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer, dropout_rng=dropout_rng)

    def loss_fn(logits, labels):
        shift_logits = logits[..., :-1, :]
@@ -548,66 +586,80 @@ def main():
    logger.info(f"  Total optimization steps = {total_train_steps}")

    train_time = 0
-    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    train_metrics = []
+    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)
-        train_metrics = []

        # Generate an epoch by shuffling sampling indices from the train dataset
        train_loader = data_loader(input_rng, train_dataset, train_batch_size, shuffle=True)
        steps_per_epoch = len(train_dataset) // train_batch_size
        # train
-        for _ in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
+        for step in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
            batch = next(train_loader)
+            batch = shard(batch)
            state, train_metric = p_train_step(state, batch)
            train_metrics.append(train_metric)

-        train_time += time.time() - train_start
+            cur_step = epoch * (len(train_dataset) // train_batch_size) + step

-        train_metric = unreplicate(train_metric)
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)

-        epochs.write(
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
-        )
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+                )

-        # ======================== Evaluating ==============================
-        eval_metrics = []
-        eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
-        eval_steps = len(eval_dataset) // eval_batch_size
-        for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
-            # Model forward
-            batch = next(eval_loader)
-            metrics = p_eval_step(state.params, batch)
-            eval_metrics.append(metrics)
+                train_metrics = []

-        # normalize eval metrics
-        eval_metrics = get_metrics(eval_metrics)
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                eval_metrics = []
+                eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
+                eval_steps = len(eval_dataset) // eval_batch_size
+                for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
+                    # Model forward
+                    batch = next(eval_loader)
+                    batch = shard(batch)
+                    metrics = p_eval_step(state.params, batch)
+                    eval_metrics.append(metrics)

-        eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

-        try:
-            eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
-        except OverflowError:
-            eval_metrics["perplexity"] = float("inf")
+                try:
+                    eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
+                except OverflowError:
+                    eval_metrics["perplexity"] = float("inf")

-        # Print metrics and update progress bar
-        desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
-        epochs.write(desc)
-        epochs.desc = desc
+                # Print metrics and update progress bar
+                desc = f"Step... ({cur_step} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
+                epochs.write(desc)
+                epochs.desc = desc

-        # Save metrics
-        if has_tensorboard and jax.process_index() == 0:
-            cur_step = epoch * (len(train_dataset) // train_batch_size)
-            write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step)
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)

-    # save last checkpoint
-    if jax.process_index() == 0:
-        params = jax.device_get(unreplicate(state.params))
-        model.save_pretrained(training_args.output_dir, params=params)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save a checkpoint every `save_steps` steps and push it to the hub if `push_to_hub` is set
+                if jax.process_index() == 0:
+                    params = jax.device_get(unreplicate(state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )


 if __name__ == "__main__":
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -56,22 +56,6 @@ from transformers import (
 )


-# Cache the result
-has_tensorboard = is_tensorboard_available()
-if has_tensorboard:
-    try:
-        from flax.metrics.tensorboard import SummaryWriter
-    except ImportError as ie:
-        has_tensorboard = False
-        print(f"Unable to display metrics through TensorBoard because some package are not installed: {ie}")
-
-else:
-    print(
-        "Unable to display metrics through TensorBoard because the package is not installed: "
-        "Please run pip install tensorboard to enable."
-    )
-
-
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

@@ -230,7 +214,7 @@ class FlaxDataCollatorForLanguageModeling:

    def mask_tokens(
        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
-    ) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
@@ -269,7 +253,7 @@ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndar
    return batch_idx


-def write_metric(train_metrics, eval_metrics, train_time, step):
+def write_train_metric(summary_writer, train_metrics, train_time, step):
    summary_writer.scalar("train_time", train_time, step)

    train_metrics = get_metrics(train_metrics)
@@ -278,6 +262,8 @@ def write_metric(train_metrics, eval_metrics, train_time, step):
        for i, val in enumerate(vals):
            summary_writer.scalar(tag, val, step - len(vals) + i + 1)

+
+def write_eval_metric(summary_writer, eval_metrics, step):
    for metric_name, value in eval_metrics.items():
        summary_writer.scalar(f"eval_{metric_name}", value, step)

@@ -308,17 +294,13 @@ if __name__ == "__main__":

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        level="NOTSET",
        datefmt="[%X]",
    )

    # Log on each process the small summary:
    logger = logging.getLogger(__name__)
-    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
-    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")
@@ -362,6 +344,20 @@ if __name__ == "__main__":
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -449,7 +445,8 @@ if __name__ == "__main__":
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
-            total_length = (total_length // max_seq_length) * max_seq_length
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
@@ -471,8 +468,22 @@ if __name__ == "__main__":
        )

    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
-        summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix())
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )

    # Data collator
    # This one will take care of randomly masking the tokens.
@@ -482,7 +493,14 @@ if __name__ == "__main__":
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())

-    model = FlaxAutoModelForMaskedLM.from_config(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
@@ -508,23 +526,33 @@ if __name__ == "__main__":
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
-    adamw = optax.adamw(
-        learning_rate=linear_decay_lr_schedule_fn,
-        b1=training_args.adam_beta1,
-        b2=training_args.adam_beta2,
-        eps=1e-8,
-        weight_decay=training_args.weight_decay,
-        mask=decay_mask_fn,
-    )
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )

    # Setup train state
-    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
@@ -582,12 +610,12 @@ if __name__ == "__main__":
    # Replicate the train state on each device
    state = jax_utils.replicate(state)

-    train_metrics = []
    train_time = 0
    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
+        train_metrics = []

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)
@@ -598,7 +626,7 @@ if __name__ == "__main__":
        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)

        # Gather the indexes for creating the batch and do a training step
-        for i, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples, pad_to_multiple_of=16)

@@ -607,44 +635,57 @@ if __name__ == "__main__":
            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)

-        train_time += time.time() - train_start
+            cur_step = epoch * (num_train_samples // train_batch_size) + step

-        epochs.write(
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
-        )
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)

-        # ======================== Evaluating ==============================
-        num_eval_samples = len(tokenized_datasets["validation"])
-        eval_samples_idx = jnp.arange(num_eval_samples)
-        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )

-        eval_metrics = []
-        for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
-            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
-            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                train_metrics = []

-            # Model forward
-            model_inputs = shard(model_inputs.data)
-            metrics = p_eval_step(state.params, model_inputs)
-            eval_metrics.append(metrics)
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)

-        # normalize eval metrics
-        eval_metrics = get_metrics(eval_metrics)
-        eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
-        eval_normalizer = eval_metrics.pop("normalizer")
-        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)

-        # Update progress bar
-        epochs.desc = (
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
-        )
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)

-        # Save metrics
-        if has_tensorboard and jax.process_index() == 0:
-            cur_step = epoch * (len(tokenized_datasets["train"]) // train_batch_size)
-            write_metric(train_metrics, eval_metrics, train_time, cur_step)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)

-    # save last checkpoint
-    if jax.process_index() == 0:
-        params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-        model.save_pretrained(training_args.output_dir, params=params)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save a checkpoint every `save_steps` steps and push it to the hub if `push_to_hub` is set
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -0,0 +1,799 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Pretraining the library models for T5-like span-masked language modeling on a text file or a dataset.
+
+Here is the full list of checkpoints on the hub that can be pretrained by this script:
+https://huggingface.co/models?filter=t5
+"""
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+import logging
+import os
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoTokenizer,
+    BatchEncoding,
+    FlaxT5ForConditionalGeneration,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    T5Config,
+    TrainingArguments,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
+
+
+# Keys of the Flax masked-LM mapping and the `model_type` of each key; `MODEL_TYPES` is only used to
+# build the help text of `ModelArguments.model_type`.
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+
+    Every field carries its command-line help text in `metadata["help"]`.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            # The two literals are concatenated implicitly, so the first one must end with a space.
+            "help": "The model checkpoint for weights initialization. "
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Options describing the data that is fed to the model for training and evaluation.
+
+    At least one of `dataset_name`, `train_file` or `validation_file` has to be provided, and any file that is
+    provided must have a `csv`, `json` or `txt` extension (checked in `__post_init__`).
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The name of the dataset to use (via the datasets library)."},
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
+    )
+    train_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "The input training data file (a text file)."},
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False,
+        metadata={"help": "Overwrite the cached training and evaluation sets"},
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization and masking. Sequences longer than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15,
+        metadata={"help": "Ratio of tokens to mask for span masked language modeling loss"},
+    )
+    mean_noise_span_length: float = field(
+        default=3.0,
+        metadata={"help": "Mean span length of masked tokens"},
+    )
+
+    def __post_init__(self):
+        """Check that a data source was given and that provided files have a supported extension."""
+        data_sources = (self.dataset_name, self.train_file, self.validation_file)
+        if all(source is None for source in data_sources):
+            raise ValueError("Need either a dataset name or a training/validation file.")
+
+        supported_extensions = ["csv", "json", "txt"]
+        # The train file is checked before the validation file.
+        for arg_name, file_path in (("train_file", self.train_file), ("validation_file", self.validation_file)):
+            if file_path is None:
+                continue
+            suffix = file_path.split(".")[-1]
+            assert suffix in supported_extensions, f"`{arg_name}` should be a csv, a json or a txt file."
+
+
+def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
+    """This function is a copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2466>`__ .
+
+    Training parameters to avoid padding with random_spans_noise_mask.
+    When training a model with random_spans_noise_mask, we would like to set the other
+    training hyperparameters in a way that avoids padding.
+    This function helps us compute these hyperparameters.
+    Each noise span contributes one sentinel token to the inputs and one sentinel token to the targets,
+    and both sequences are counted with one EOS token appended.
+    The function returns the number of raw tokens an example must contain so that the masked inputs
+    reach `inputs_length`, together with the resulting length of the encoded targets.
+
+    Args:
+        inputs_length: an integer - desired length of the tokenized inputs sequence
+        noise_density: a float
+        mean_noise_span_length: a float
+    Returns:
+        tokens_length: length of original text in tokens
+        targets_length: an integer - length in tokens of encoded targets sequence
+    """
+
+    def _lengths_after_masking(raw_length):
+        """Return (inputs length, targets length) obtained from `raw_length` raw tokens."""
+        noise_count = int(round(raw_length * noise_density))
+        span_count = int(round(noise_count / mean_noise_span_length))
+        kept_count = raw_length - noise_count
+        # inputs: kept tokens + one sentinel per noise span + EOS
+        # targets: noise tokens + one sentinel per noise span + EOS
+        return kept_count + span_count + 1, noise_count + span_count + 1
+
+    # Grow the raw length for as long as one more token still fits into `inputs_length`.
+    tokens_length = inputs_length
+    while True:
+        next_inputs_length, _ = _lengths_after_masking(tokens_length + 1)
+        if next_inputs_length > inputs_length:
+            break
+        tokens_length += 1
+
+    inputs_length, targets_length = _lengths_after_masking(tokens_length)
+
+    # minor hack to get the targets length to be equal to inputs length
+    # which is more likely to have been set to a nice round number.
+    if noise_density == 0.5 and targets_length > inputs_length:
+        return tokens_length - 1, targets_length - 1
+    return tokens_length, targets_length
+
+
+@flax.struct.dataclass
+class FlaxDataCollatorForT5MLM:
+    """
+    Data collator used for T5 span-masked language modeling.
+    It is made sure that after masking the inputs are of length `data_args.max_seq_length` and targets are also of fixed length.
+    For more information on how T5 span-masked language modeling works, one can take a look
+    at the `official paper <https://arxiv.org/pdf/1910.10683.pdf>`__
+    or the `official code for preprocessing <https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py>`__ .
+
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        noise_density (:obj:`float`):
+            The probability with which to (randomly) mask tokens in the input.
+        mean_noise_span_length (:obj:`float`):
+            The average span length of the masked tokens.
+        input_length (:obj:`int`):
+            The expected input length after masking.
+        target_length (:obj:`int`):
+            The expected target length after masking.
+        pad_token_id: (:obj:`int`):
+            The pad token id of the model
+        decoder_start_token_id: (:obj:`int`):
+            The decoder start token id of the model
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    noise_density: float
+    mean_noise_span_length: float
+    input_length: int
+    target_length: int
+    pad_token_id: int
+    decoder_start_token_id: int
+
+    def __call__(self, examples: List[Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]:
+        """
+        Collate `examples` into a batch with span-masked `input_ids`, `labels` and `decoder_input_ids`.
+
+        Raises:
+            ValueError: if the masked inputs or labels do not have the configured `input_length` /
+                `target_length`.
+        """
+        # convert list to dict and tensorize input
+        batch = BatchEncoding({k: np.array([example[k] for example in examples]) for k in examples[0].keys()})
+
+        input_ids = batch["input_ids"]
+        batch_size, expanded_input_length = input_ids.shape
+
+        # One independent random mask per example; the labels mask is its complement.
+        mask_indices = np.asarray([self.random_spans_noise_mask(expanded_input_length) for _ in range(batch_size)])
+        labels_mask = ~mask_indices
+
+        input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
+        labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))
+
+        batch["input_ids"] = self.filter_input_ids(input_ids, input_ids_sentinel)
+        batch["labels"] = self.filter_input_ids(input_ids, labels_sentinel)
+
+        if batch["input_ids"].shape[-1] != self.input_length:
+            raise ValueError(
+                f"`input_ids` are incorrectly preprocessed. `input_ids` length is {batch['input_ids'].shape[-1]}, but should be {self.input_length}."
+            )
+
+        if batch["labels"].shape[-1] != self.target_length:
+            raise ValueError(
+                f"`labels` are incorrectly preprocessed. `labels` length is {batch['labels'].shape[-1]}, but should be {self.target_length}."
+            )
+
+        # to check that tokens are correctly preprocessed, one can run `self.tokenizer.batch_decode(input_ids)` and `self.tokenizer.batch_decode(labels)` here...
+        batch["decoder_input_ids"] = shift_tokens_right(
+            batch["labels"], self.pad_token_id, self.decoder_start_token_id
+        )
+
+        return batch
+
+    def create_sentinel_ids(self, mask_indices):
+        """
+        Sentinel ids creation given the indices that should be masked.
+        The first position of every masked span is replaced by a sentinel id. Sentinel ids are assigned
+        per row in order of appearance, counting down from `len(self.tokenizer) - 1`.
+        The remaining positions of a masked span are set to `-1` so that they can be deleted afterwards.
+        Positions that are not masked are `0`.
+        """
+        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
+        start_indices[:, 0] = mask_indices[:, 0]
+
+        sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
+        # Count down from the last id of the tokenizer so that every sentinel id stays below
+        # `len(self.tokenizer)`, which is the vocabulary size this script configures the model with.
+        # NOTE(review): assumes the sentinel tokens occupy the highest ids of the tokenizer, with
+        # `<extra_id_0>` being the last one, as in the released T5 vocabularies - confirm for custom tokenizers.
+        sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
+        sentinel_ids -= mask_indices - start_indices
+
+        return sentinel_ids
+
+    def filter_input_ids(self, input_ids, sentinel_ids):
+        """
+        Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
+        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
+        An EOS token is appended to every row.
+        """
+        batch_size = input_ids.shape[0]
+
+        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
+        # Only the `-1` placeholders are deleted. Regular tokens and sentinels are >= 0; a real token
+        # with id 0 has to be kept, otherwise the rows would no longer have the same length.
+        input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1))
+        input_ids = np.concatenate(
+            [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1
+        )
+        return input_ids
+
+    def random_spans_noise_mask(self, length):
+        """This function is a copy of `random_spans_noise_mask <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .
+
+        Noise mask consisting of random spans of noise tokens.
+        The number of noise tokens and the number of noise spans and non-noise spans
+        are determined deterministically as follows:
+        num_noise_tokens = round(length * noise_density)
+        num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
+        Spans alternate between non-noise and noise, beginning with non-noise.
+        Subject to the above restrictions, all masks are equally likely.
+
+        `noise_density` and `mean_noise_span_length` are read from the collator itself.
+        The span boundaries are drawn with the global `np.random` state.
+
+        Args:
+            length: an int32 scalar (length of the incoming token sequence)
+
+        Returns:
+            a boolean tensor with shape [length]
+        """
+
+        orig_length = length
+
+        num_noise_tokens = int(np.round(length * self.noise_density))
+        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
+        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
+        num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length))
+
+        # avoid degeneracy by ensuring positive number of noise spans
+        num_noise_spans = max(num_noise_spans, 1)
+        num_nonnoise_tokens = length - num_noise_tokens
+
+        # pick the lengths of the noise spans and the non-noise spans
+        def _random_segmentation(num_items, num_segments):
+            """Partition a sequence of items randomly into non-empty segments.
+            Args:
+                num_items: an integer scalar > 0
+                num_segments: an integer scalar in [1, num_items]
+            Returns:
+                a Tensor with shape [num_segments] containing positive integers that add
+                up to num_items
+            """
+            mask_indices = np.arange(num_items - 1) < (num_segments - 1)
+            np.random.shuffle(mask_indices)
+            first_in_segment = np.pad(mask_indices, [[1, 0]])
+            segment_id = np.cumsum(first_in_segment)
+            # count length of sub segments assuming that list is sorted
+            _, segment_length = np.unique(segment_id, return_counts=True)
+            return segment_length
+
+        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
+        nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans)
+
+        # Interleave as [nonnoise_0, noise_0, nonnoise_1, noise_1, ...].
+        interleaved_span_lengths = np.reshape(
+            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2]
+        )
+        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
+        span_start_indicator = np.zeros((length,), dtype=np.int8)
+        span_start_indicator[span_starts] = True
+        # Spans are numbered from 0, so the odd-numbered spans are the noise spans.
+        span_num = np.cumsum(span_start_indicator)
+        is_noise = np.equal(span_num % 2, 1)
+
+        return is_noise[:orig_length]
+
+
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    """
+    Split `samples_idx` into consecutive batches of exactly `batch_size` indices.
+
+    Trailing indices that do not fill a complete batch are dropped.
+
+    Args:
+        samples_idx: one-dimensional array of sample indices.
+        batch_size: number of indices per batch.
+
+    Returns:
+        A list with one array of `batch_size` indices per batch. The list is empty when there are fewer
+        than `batch_size` indices.
+    """
+    num_samples = len(samples_idx)
+    num_batches = num_samples // batch_size
+
+    if num_batches == 0:
+        # `np.split` cannot split into zero sections, so there is simply no full batch to return.
+        return []
+
+    # Keep only the indices that fill complete batches.
+    samples_idx = samples_idx[: num_batches * batch_size]
+    return np.split(samples_idx, num_batches)
+
+
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    """
+    Log the elapsed training time and the collected training metrics to `summary_writer`.
+
+    `train_time` is written at `step`. The values of every metric are written one step apart, such that
+    the last value of each metric lands on `step`.
+    """
+    summary_writer.scalar("train_time", train_time, step)
+
+    # presumably `get_metrics` turns the list of per-step metrics into one sequence per key - verify against flax
+    collected = get_metrics(train_metrics)
+    for name in collected:
+        history = collected[name]
+        first_step = step - len(history) + 1
+        for offset, value in enumerate(history):
+            summary_writer.scalar(f"train_{name}", value, first_step + offset)
+
+
+def write_eval_metric(summary_writer, eval_metrics, step):
+    """Log every entry of `eval_metrics` to `summary_writer` at `step`, tagged `eval_<metric name>`."""
+    for name in eval_metrics:
+        summary_writer.scalar(f"eval_{name}", eval_metrics[name], step)
+
+
+if __name__ == "__main__":
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        level="NOTSET",
+        datefmt="[%X]",
+    )
+
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if model_args.config_name:
+        config = T5Config.from_pretrained(
+            model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
+        )
+    elif model_args.model_name_or_path:
+        config = T5Config.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
+        )
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
+    def tokenize_function(examples):
+        return tokenizer(examples[text_column_name], return_attention_mask=False)
+
+    tokenized_datasets = datasets.map(
+        tokenize_function,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not data_args.overwrite_cache,
+    )
+
+    # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
+    # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
+    # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
+    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
+        inputs_length=max_seq_length,
+        noise_density=data_args.mlm_probability,
+        mean_noise_span_length=data_args.mean_noise_span_length,
+    )
+
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        if total_length >= expanded_inputs_length:
+            total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
+            for k, t in concatenated_examples.items()
+        }
+        return result
+
+    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+    # might be slower to preprocess.
+    #
+    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+    tokenized_datasets = tokenized_datasets.map(
+        group_texts,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        load_from_cache_file=not data_args.overwrite_cache,
+    )
+
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+    if model_args.model_name_or_path:
+        model = FlaxT5ForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxT5ForConditionalGeneration(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForT5MLM(
+        tokenizer=tokenizer,
+        noise_density=data_args.mlm_probability,
+        mean_noise_span_length=data_args.mean_noise_span_length,
+        input_length=max_seq_length,
+        target_length=targets_length,
+        pad_token_id=model.config.pad_token_id,
+        decoder_start_token_id=model.config.decoder_start_token_id,
+    )
+
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {
+            path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")])
+            for path in flat_params
+        }
+        return traverse_util.unflatten_dict(flat_mask)
+
+    # create adam optimizer
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+
+        def loss_fn(params):
+            labels = batch.pop("labels")
+
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+
+            # compute loss
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+
+        return new_state, metrics, new_dropout_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+
+        logits = model(**batch, params=params, train=False)[0]
+
+        # compute loss
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
+
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels)
+
+        # summarize metrics
+        metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return metrics
+
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples)
+
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+                )
+
+                train_metrics = []
+
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples)
+
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+
+                # get eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+
+                # Update progress bar
+                epochs.write(f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})")
+
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save a checkpoint every `save_steps` steps and optionally push it to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
--- a/examples/flax/language-modeling/t5_tokenizer_model.py
+++ b/examples/flax/language-modeling/t5_tokenizer_model.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+import json
+from typing import Iterator, List, Union
+
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
+from tokenizers.implementations.base_tokenizer import BaseTokenizer
+from tokenizers.models import Unigram
+from tokenizers.processors import TemplateProcessing
+
+
+class SentencePieceUnigramTokenizer(BaseTokenizer):
+    """
+    This class is a copy of `DeDLOC's tokenizer implementation <https://github.com/yandex-research/DeDLOC/blob/main/sahajbert/tokenizer/tokenizer_model.py>`__ .
+
+    Custom SentencePiece Unigram Tokenizer with NMT, NFKC, spaces and lower-casing characters normalization
+    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
+
+    Args:
+        replacement: Character handed to the Metaspace pre-tokenizer and decoder as the whitespace replacement.
+        add_prefix_space: Forwarded unchanged to the Metaspace pre-tokenizer and decoder.
+        unk_token: Unknown token, listed with id 2 in `self.special_tokens`.
+        eos_token: End-of-sequence token, listed with id 1; the post-processor appends it to every single sequence.
+        pad_token: Padding token, listed with id 0.
+    """
+
+    def __init__(
+        self,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+        unk_token: Union[str, AddedToken] = "<unk>",
+        eos_token: Union[str, AddedToken] = "</s>",
+        pad_token: Union[str, AddedToken] = "<pad>",
+    ):
+        self.special_tokens = {
+            "pad": {"id": 0, "token": pad_token},
+            "eos": {"id": 1, "token": eos_token},
+            "unk": {"id": 2, "token": unk_token},
+        }
+
+        # Lay the special tokens out in a list whose positions equal their ids; this list is what the trainers
+        # receive as `special_tokens` (presumably so the trained vocabulary assigns them these ids - TODO confirm).
+        self.special_tokens_list = [None] * len(self.special_tokens)
+        for token_dict in self.special_tokens.values():
+            self.special_tokens_list[token_dict["id"]] = token_dict["token"]
+
+        tokenizer = Tokenizer(Unigram())
+
+        # Normalization: NMT cleanup, NFKC, collapse runs of two or more spaces into one, then lower-case.
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+                normalizers.Replace(Regex(" {2,}"), " "),
+                normalizers.Lowercase(),
+            ]
+        )
+        # Pre-tokenization: Metaspace, then every digit on its own, then punctuation.
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
+                pre_tokenizers.Digits(individual_digits=True),
+                pre_tokenizers.Punctuation(),
+            ]
+        )
+        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+
+        # Append the EOS token to every encoded single sequence.
+        tokenizer.post_processor = TemplateProcessing(
+            single=f"$A {self.special_tokens['eos']['token']}",
+            special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
+        )
+
+        parameters = {
+            "model": "SentencePieceUnigram",
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    def _build_trainer(self, vocab_size: int, show_progress: bool) -> trainers.UnigramTrainer:
+        """Create the Unigram trainer shared by `train` and `train_from_iterator`."""
+        return trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=self.special_tokens_list,
+            show_progress=show_progress,
+        )
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+    ):
+        """Train the model using the given files
+
+        Args:
+            files: A single path or a list of paths; a single string is wrapped into a one-element list.
+            vocab_size: Vocabulary size handed to the Unigram trainer.
+            show_progress: Whether the trainer should display progress.
+        """
+
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=self._build_trainer(vocab_size, show_progress))
+
+        self.add_unk_id()
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+    ):
+        """Train the model using the given iterator
+
+        Args:
+            iterator: Iterator over texts (or over batches of texts) to train on.
+            vocab_size: Vocabulary size handed to the Unigram trainer.
+            show_progress: Whether the trainer should display progress.
+        """
+
+        self._tokenizer.train_from_iterator(iterator, trainer=self._build_trainer(vocab_size, show_progress))
+
+        self.add_unk_id()
+
+    def add_unk_id(self):
+        """Set the model's `unk_id` to the id of the unknown token and reload the tokenizer.
+
+        The tokenizer is serialized to JSON, `model.unk_id` is overwritten with the id stored in
+        `self.special_tokens["unk"]`, and `self._tokenizer` is rebuilt from the patched JSON.
+        """
+        tokenizer_json = json.loads(self._tokenizer.to_str())
+
+        tokenizer_json["model"]["unk_id"] = self.special_tokens["unk"]["id"]
+
+        self._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
--- a/examples/flax/question-answering/README.md
+++ b/examples/flax/question-answering/README.md
@@ -0,0 +1,128 @@
+<!---
+Copyright 2021 The Google Flax Team Authors and HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Question Answering examples
+
+Based on the script [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/flax/question-answering/run_qa.py).
+
+**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
+uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
+[this table](https://huggingface.co/transformers/index.html#supported-frameworks); if it doesn't, you can still use the old version
+of the script.
+
+
+The following example fine-tunes BERT on SQuAD:
+
+To begin with, it is recommended to create a model repository to save the trained model and logs.
+Here we call the model `"bert-qa-squad-test"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create bert-qa-squad-test
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/bert-qa-squad-test
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to `run_qa.py`.
+
+```bash
+export MODEL_DIR="./bert-qa-squad-test"
+ln -s ~/transformers/examples/flax/question-answering/run_qa.py run_qa.py
+```
+
+```bash
+python run_qa.py \
+  --model_name_or_path bert-base-uncased \
+  --dataset_name squad \
+  --do_train   \
+  --do_eval   \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2 \
+  --per_device_train_batch_size 12 \
+  --output_dir ${MODEL_DIR} \
+  --eval_steps 1000 \
+  --push_to_hub
+```
+
+Using the command above, the script will train for 2 epochs and run eval after each epoch. 
+Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
+You can see the results by running `tensorboard` in that directory:
+
+```bash
+$ tensorboard --logdir .
+```
+
+or directly on the hub under *Training metrics*.
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.62
+exact_match = 81.34
+```
+
+Sample metrics - [tensorboard.dev](https://tensorboard.dev/experiment/6gU75Hx8TGCnc6tr4ZgI9Q)
+
+Here is an example training on 4 TITAN RTX GPUs with the BERT Whole Word Masking uncased model to reach an F1 > 93 on SQuAD1.1:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python run_qa.py   \
+--model_name_or_path bert-large-uncased-whole-word-masking   \
+--dataset_name squad   \
+--do_train   \
+--do_eval   \
+--per_device_train_batch_size 6   \
+--learning_rate 3e-5   \
+--num_train_epochs 2   \
+--max_seq_length 384   \
+--doc_stride 128   \
+--output_dir /tmp/wwm_uncased_finetuned_squad/ \
+--eval_steps 1000
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.31
+exact_match = 87.04
+```
+
+
+### Usage notes
+
+Note that when contexts are long they may be split into multiple training cases, not all of which may contain
+the answer span. 
+
+As-is, the example script will train on SQuAD or any other question-answering dataset formatted the same way, and can handle user
+inputs as well.
+
+### Memory usage and data loading
+
+One thing to note is that all data is loaded into memory in this script. Most question answering datasets are small
+enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle
+data streaming.
--- a/examples/flax/question-answering/requirements.txt
+++ b/examples/flax/question-answering/requirements.txt
@@ -0,0 +1,5 @@
+datasets >= 1.8.0
+jax>=0.2.17
+jaxlib>=0.1.68
+flax>=0.3.4
+optax>=0.0.8
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -0,0 +1,905 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for question answering.
+"""
+# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
+
+import logging
+import os
+import random
+import sys
+import time
+from dataclasses import dataclass, field
+from itertools import chain
+from typing import Any, Callable, Dict, Optional, Tuple
+
+import datasets
+import numpy as np
+from datasets import load_dataset, load_metric
+from tqdm import tqdm
+
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+from flax import struct, traverse_util
+from flax.jax_utils import replicate, unreplicate
+from flax.metrics import tensorboard
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    EvalPrediction,
+    FlaxAutoModelForQuestionAnswering,
+    HfArgumentParser,
+    PreTrainedTokenizerFast,
+    TrainingArguments,
+)
+from transformers.utils import check_min_version
+from utils_qa import postprocess_qa_predictions
+
+
+logger = logging.getLogger(__name__)
+
+# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
+check_min_version("4.11.0")
+
+Array = Any
+Dataset = datasets.arrow_dataset.Dataset
+PRNGKey = Any
+
+
+# region Arguments
+@dataclass
+class ModelArguments:
+    """
+    Command-line arguments selecting the pretrained model, config and tokenizer to fine-tune.
+    """
+
+    # Required: there is no default, so this must always be supplied.
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    # Optional overrides; when unset the script falls back to `model_name_or_path`.
+    config_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained config name or path if not the same as model_name"},
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
+    )
+    # Download cache location.
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
+    )
+    # Revision of the model repository to load.
+    model_revision: str = field(
+        default="main",
+        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+    )
+    # Authentication for private models.
+    use_auth_token: bool = field(
+        default=False,
+        metadata={
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
+            "with private models)."
+        },
+    )
+    # Floating-point format, given as a string.
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Either `dataset_name` or at least one of `train_file` / `validation_file` / `test_file` must be given; every
+    file that is given must have a `csv` or `json` extension (checked in `__post_init__`).
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    # The help texts of the three file arguments state the formats that `__post_init__` actually accepts.
+    train_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a csv or a json file)."}
+    )
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate on (a csv or a json file)."},
+    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input test data file to predict on (a csv or a json file)."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    max_seq_length: int = field(
+        default=384,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to pad all samples to `max_seq_length`. "
+            "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
+            "be faster on GPU but will be slower on TPU)."
+        },
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    max_predict_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
+            "value if set."
+        },
+    )
+    version_2_with_negative: bool = field(
+        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
+    )
+    null_score_diff_threshold: float = field(
+        default=0.0,
+        metadata={
+            "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
+            "the score of the null answer minus this threshold, the null answer is selected for this example. "
+            "Only useful when `version_2_with_negative=True`."
+        },
+    )
+    doc_stride: int = field(
+        default=128,
+        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
+    )
+    n_best_size: int = field(
+        default=20,
+        metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
+    )
+    max_answer_length: int = field(
+        default=30,
+        metadata={
+            "help": "The maximum length of an answer that can be generated. This is needed because the start "
+            "and end predictions are not conditioned on one another."
+        },
+    )
+
+    def __post_init__(self):
+        """Validate that a data source was given and that every data file is a csv or json file.
+
+        Raises:
+            ValueError: if neither a dataset name nor any data file is provided.
+            AssertionError: if a provided data file does not end in `.csv` or `.json`.
+        """
+        data_files = {
+            "train_file": self.train_file,
+            "validation_file": self.validation_file,
+            "test_file": self.test_file,
+        }
+        if self.dataset_name is None and all(path is None for path in data_files.values()):
+            raise ValueError("Need either a dataset name or a training/validation file/test_file.")
+
+        # The extension check applies to every provided file, even when a dataset name is also given.
+        for arg_name, path in data_files.items():
+            if path is not None:
+                extension = path.split(".")[-1]
+                assert extension in ["csv", "json"], f"`{arg_name}` should be a csv or a json file."
+
+
+# endregion
+
+# region Create a train state
+def create_train_state(
+    model: FlaxAutoModelForQuestionAnswering,
+    learning_rate_fn: Callable[[int], float],
+    num_labels: int,
+    training_args: TrainingArguments,
+) -> train_state.TrainState:
+    """Build the initial train state: model parameters, an AdamW optimizer and the loss function.
+
+    Args:
+        model: Model whose `__call__` becomes `apply_fn` and whose `params` seed the state.
+        learning_rate_fn: Schedule passed to AdamW as its learning rate.
+        num_labels: Number of classes used to one-hot encode the start and end labels.
+        training_args: Supplies `adam_beta1`, `adam_beta2`, `adam_epsilon` and `weight_decay`.
+
+    Returns:
+        A `TrainState` that additionally carries `logits_fn` and `loss_fn`.
+    """
+
+    class TrainState(train_state.TrainState):
+        """Train state with an Optax optimizer plus two static (non-pytree) callables.
+
+        Args:
+          logits_fn: Applied to last layer to obtain the logits.
+          loss_fn: Function to compute the loss.
+        """
+
+        logits_fn: Callable = struct.field(pytree_node=False)
+        loss_fn: Callable = struct.field(pytree_node=False)
+
+    def decay_mask_fn(params):
+        # Optax applies weight decay only where this mask is True. Biases and LayerNorm scales are excluded.
+        # The parameter naming is adapted to FlaxBERT-like models; other models may name their layer norm
+        # parameters differently and would need this adjusted.
+        decay_mask = {}
+        for path in traverse_util.flatten_dict(params):
+            is_bias = path[-1] == "bias"
+            is_layer_norm_scale = path[-2:] == ("LayerNorm", "scale")
+            decay_mask[path] = not (is_bias or is_layer_norm_scale)
+        return traverse_util.unflatten_dict(decay_mask)
+
+    def cross_entropy_loss(logits, labels):
+        # `logits` and `labels` are each indexed as (start, end); the two losses are averaged.
+        start_logits, end_logits = logits[0], logits[1]
+        start_labels, end_labels = labels[0], labels[1]
+        start_loss = optax.softmax_cross_entropy(start_logits, onehot(start_labels, num_classes=num_labels))
+        end_loss = optax.softmax_cross_entropy(end_logits, onehot(end_labels, num_classes=num_labels))
+        return jnp.mean((start_loss + end_loss) / 2.0)
+
+    def identity_logits(logits):
+        return logits
+
+    optimizer = optax.adamw(
+        learning_rate=learning_rate_fn,
+        b1=training_args.adam_beta1,
+        b2=training_args.adam_beta2,
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+        mask=decay_mask_fn,
+    )
+
+    return TrainState.create(
+        apply_fn=model.__call__,
+        params=model.params,
+        tx=optimizer,
+        logits_fn=identity_logits,
+        loss_fn=cross_entropy_loss,
+    )
+
+
+# endregion
+
+
+# region Create learning rate function
+def create_learning_rate_fn(
+    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+) -> Callable[[int], jnp.array]:
+    """Build a schedule that ramps linearly from 0 to `learning_rate`, then decays linearly back to 0.
+
+    The warmup lasts `num_warmup_steps` steps; the decay covers the remaining
+    `(train_ds_size // train_batch_size) * num_train_epochs - num_warmup_steps` steps.
+    """
+    total_steps = (train_ds_size // train_batch_size) * num_train_epochs
+    warmup = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+    decay = optax.linear_schedule(
+        init_value=learning_rate, end_value=0, transition_steps=total_steps - num_warmup_steps
+    )
+    # Switch from warmup to decay once `num_warmup_steps` steps have elapsed.
+    return optax.join_schedules(schedules=[warmup, decay], boundaries=[num_warmup_steps])
+
+
+# endregion
+
+# region train data iterator
+def train_data_collator(rng: PRNGKey, dataset: Dataset, batch_size: int):
+    """Yield shuffled, sharded training batches of exactly `batch_size` examples.
+
+    The dataset indices are permuted with `rng`; the trailing examples that do not fill a whole batch are dropped.
+    """
+    num_batches = len(dataset) // batch_size
+    shuffled = jax.random.permutation(rng, len(dataset))
+    # Keep only as many indices as fit into complete batches, one row of indices per batch.
+    batch_indices = shuffled[: num_batches * batch_size].reshape((num_batches, batch_size))
+
+    for indices in batch_indices:
+        rows = dataset[indices]
+        yield shard({name: np.array(column) for name, column in rows.items()})
+
+
+# endregion
+
+# region eval data iterator
+def eval_data_collator(dataset: Dataset, batch_size: int):
+    """Yield consecutive, sharded evaluation batches of exactly `batch_size` examples.
+
+    Only `len(dataset) // batch_size` full batches are produced; a trailing partial batch is not yielded.
+    """
+    num_batches = len(dataset) // batch_size
+    for start in range(0, num_batches * batch_size, batch_size):
+        rows = dataset[start : start + batch_size]
+        yield shard({name: np.array(column) for name, column in rows.items()})
+
+
+# endregion
+
+
+def main():
+    # region Argument parsing
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # endregion
+
+    # region Logging
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+    # endregion
+
+    # region Load Data
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+    else:
+        # Loading the dataset from local csv or json file.
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
+
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+            extension = data_args.validation_file.split(".")[-1]
+        if data_args.test_file is not None:
+            data_files["test"] = data_args.test_file
+            extension = data_args.test_file.split(".")[-1]
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # endregion
+
+    # region Load pretrained model and tokenizer
+    #
+    # Load pretrained model and tokenizer
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        use_fast=True,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # endregion
+
+    # region Tokenizer check: this script requires a fast tokenizer.
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        raise ValueError(
+            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
+            "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
+            "requirement"
+        )
+    # endregion
+
+    # region Preprocessing the datasets
+    # Preprocessing is slightly different for training and evaluation.
+    if training_args.do_train:
+        column_names = raw_datasets["train"].column_names
+    elif training_args.do_eval:
+        column_names = raw_datasets["validation"].column_names
+    else:
+        column_names = raw_datasets["test"].column_names
+    question_column_name = "question" if "question" in column_names else column_names[0]
+    context_column_name = "context" if "context" in column_names else column_names[1]
+    answer_column_name = "answers" if "answers" in column_names else column_names[2]
+
+    # Padding side determines if we do (question|context) or (context|question).
+    pad_on_right = tokenizer.padding_side == "right"
+
+    if data_args.max_seq_length > tokenizer.model_max_length:
+        logger.warning(
+            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
+            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
+        )
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    # Training preprocessing
+    def prepare_train_features(examples):
+        """
+        Tokenize a batch of question/context pairs and label every resulting feature with answer token positions.
+
+        Long contexts are split into several overlapping features (`return_overflowing_tokens` with a stride of
+        `data_args.doc_stride`), so the output may hold more rows than `examples`.
+
+        Args:
+            examples: batch of raw examples indexed by column name; the question column is overwritten in place
+                with its left-stripped version.
+
+        Returns:
+            The tokenizer output (with `overflow_to_sample_mapping` and `offset_mapping` removed) extended with
+            `start_positions` and `end_positions`, one entry per feature. Features whose example has no answer, or
+            whose span does not fully contain the first answer, get the CLS token index for both.
+        """
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
+        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possibly giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=data_args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length",
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+        # The offset mappings will give us a map from token to character position in the original context. This will
+        # help us compute the start_positions and end_positions.
+        offset_mapping = tokenized_examples.pop("offset_mapping")
+
+        # Let's label those examples!
+        tokenized_examples["start_positions"] = []
+        tokenized_examples["end_positions"] = []
+
+        for i, offsets in enumerate(offset_mapping):
+            # We will label impossible answers with the index of the CLS token.
+            input_ids = tokenized_examples["input_ids"][i]
+            cls_index = input_ids.index(tokenizer.cls_token_id)
+
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples.sequence_ids(i)
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            answers = examples[answer_column_name][sample_index]
+            # If no answers are given, set the cls_index as answer.
+            if len(answers["answer_start"]) == 0:
+                tokenized_examples["start_positions"].append(cls_index)
+                tokenized_examples["end_positions"].append(cls_index)
+            else:
+                # Start/end character index of the answer in the text (only the first listed answer is used).
+                start_char = answers["answer_start"][0]
+                end_char = start_char + len(answers["text"][0])
+
+                # Start token index of the current span in the text.
+                token_start_index = 0
+                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
+                    token_start_index += 1
+
+                # End token index of the current span in the text.
+                token_end_index = len(input_ids) - 1
+                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
+                    token_end_index -= 1
+
+                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
+                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
+                    tokenized_examples["start_positions"].append(cls_index)
+                    tokenized_examples["end_positions"].append(cls_index)
+                else:
+                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
+                    # Note: we could go after the last offset if the answer is the last word (edge case).
+                    # Each loop overshoots by one token, hence the -1 / +1 corrections when appending.
+                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
+                        token_start_index += 1
+                    tokenized_examples["start_positions"].append(token_start_index - 1)
+                    while offsets[token_end_index][1] >= end_char:
+                        token_end_index -= 1
+                    tokenized_examples["end_positions"].append(token_end_index + 1)
+
+        return tokenized_examples
+
+    processed_raw_datasets = dict()
+    if training_args.do_train:
+        if "train" not in raw_datasets:
+            raise ValueError("--do_train requires a train dataset")
+        train_dataset = raw_datasets["train"]
+        if data_args.max_train_samples is not None:
+            # We will select sample from whole data if agument is specified
+            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        # Create train feature from dataset
+        train_dataset = train_dataset.map(
+            prepare_train_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_train_samples is not None:
+            # Number of samples might increase during Feature Creation, We select only specified max samples
+            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        processed_raw_datasets["train"] = train_dataset
+
+    # Validation preprocessing
+    def prepare_validation_features(examples):
+        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+        # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
+        # left whitespace
+        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
+
+        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+        # in one example possible giving several features when a context is long, each of those features having a
+        # context that overlaps a bit the context of the previous feature.
+        tokenized_examples = tokenizer(
+            examples[question_column_name if pad_on_right else context_column_name],
+            examples[context_column_name if pad_on_right else question_column_name],
+            truncation="only_second" if pad_on_right else "only_first",
+            max_length=max_seq_length,
+            stride=data_args.doc_stride,
+            return_overflowing_tokens=True,
+            return_offsets_mapping=True,
+            padding="max_length",
+        )
+
+        # Since one example might give us several features if it has a long context, we need a map from a feature to
+        # its corresponding example. This key gives us just that.
+        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+        # corresponding example_id and we will store the offset mappings.
+        tokenized_examples["example_id"] = []
+
+        for i in range(len(tokenized_examples["input_ids"])):
+            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+            sequence_ids = tokenized_examples.sequence_ids(i)
+            context_index = 1 if pad_on_right else 0
+
+            # One example can give several spans, this is the index of the example containing this span of text.
+            sample_index = sample_mapping[i]
+            tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
+            # position is part of the context or not.
+            tokenized_examples["offset_mapping"][i] = [
+                (o if sequence_ids[k] == context_index else None)
+                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+            ]
+
+        return tokenized_examples
+
+    if training_args.do_eval:
+        if "validation" not in raw_datasets:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_examples = raw_datasets["validation"]
+        if data_args.max_eval_samples is not None:
+            # We will select sample from whole data
+            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
+        # Validation Feature Creation
+        eval_dataset = eval_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_eval_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        processed_raw_datasets["validation"] = eval_dataset
+
+    if training_args.do_predict:
+        if "test" not in raw_datasets:
+            raise ValueError("--do_predict requires a test dataset")
+        predict_examples = raw_datasets["test"]
+        if data_args.max_predict_samples is not None:
+            # We will select sample from whole data
+            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
+        # Predict Feature Creation
+        predict_dataset = predict_examples.map(
+            prepare_validation_features,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        if data_args.max_predict_samples is not None:
+            # During Feature creation dataset samples might increase, we will select required samples again
+            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
+        processed_raw_datasets["test"] = predict_dataset
+    # endregion
+
+    # region Metrics and Post-processing:
+    def post_processing_function(examples, features, predictions, stage="eval"):
+        # Post-processing: we match the start logits and end logits to answers in the original context.
+        predictions = postprocess_qa_predictions(
+            examples=examples,
+            features=features,
+            predictions=predictions,
+            version_2_with_negative=data_args.version_2_with_negative,
+            n_best_size=data_args.n_best_size,
+            max_answer_length=data_args.max_answer_length,
+            null_score_diff_threshold=data_args.null_score_diff_threshold,
+            output_dir=training_args.output_dir,
+            prefix=stage,
+        )
+        # Format the result to the format the metric expects.
+        if data_args.version_2_with_negative:
+            formatted_predictions = [
+                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+            ]
+        else:
+            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+
+        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
+        return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
+
+    def compute_metrics(p: EvalPrediction):
+        return metric.compute(predictions=p.predictions, references=p.label_ids)
+
+    # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
+    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
+        """
+        Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
+
+        Args:
+            start_or_end_logits(:obj:`tensor`):
+                This is the output predictions of the model. We can only enter either start or end logits.
+            eval_dataset: Evaluation dataset
+            max_len(:obj:`int`):
+                The maximum length of the output tensor. ( See the model.eval() part for more details )
+        """
+
+        step = 0
+        # create a numpy array and fill it with -100.
+        logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64)
+        # Now since we have create an array now we will populate it with the outputs of the model.
+        for i, output_logit in enumerate(start_or_end_logits):  # populate columns
+            # We have to fill it such that we have to take the whole tensor and replace it on the newly created array
+            # And after every iteration we have to change the step
+
+            batch_size = output_logit.shape[0]
+            cols = output_logit.shape[1]
+
+            if step + batch_size < len(dataset):
+                logits_concat[step : step + batch_size, :cols] = output_logit
+            else:
+                logits_concat[step:, :cols] = output_logit[: len(dataset) - step]
+
+            step += batch_size
+
+        return logits_concat
+
+    # endregion
+
+    # region Training steps and logging init
+    train_dataset = processed_raw_datasets["train"]
+    eval_dataset = processed_raw_datasets["validation"]
+
+    # Log a few random samples from the training set:
+    for index in random.sample(range(len(train_dataset)), 3):
+        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
+
+    # Define a summary writer
+    summary_writer = tensorboard.SummaryWriter(training_args.output_dir)
+    summary_writer.hparams({**training_args.to_dict(), **vars(model_args), **vars(data_args)})
+
+    def write_train_metric(summary_writer, train_metrics, train_time, step):
+        summary_writer.scalar("train_time", train_time, step)
+
+        train_metrics = get_metrics(train_metrics)
+        for key, vals in train_metrics.items():
+            tag = f"train_{key}"
+            for i, val in enumerate(vals):
+                summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+    def write_eval_metric(summary_writer, eval_metrics, step):
+        for metric_name, value in eval_metrics.items():
+            summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+    num_epochs = int(training_args.num_train_epochs)
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+    train_batch_size = training_args.per_device_train_batch_size * jax.local_device_count()
+    eval_batch_size = training_args.per_device_eval_batch_size * jax.local_device_count()
+    # endregion
+
+    # region Load model
+    model = FlaxAutoModelForQuestionAnswering.from_pretrained(
+        model_args.model_name_or_path,
+        config=config,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+        seed=training_args.seed,
+        dtype=getattr(jnp, model_args.dtype),
+    )
+
+    learning_rate_fn = create_learning_rate_fn(
+        len(train_dataset),
+        train_batch_size,
+        training_args.num_train_epochs,
+        training_args.warmup_steps,
+        training_args.learning_rate,
+    )
+
+    state = create_train_state(model, learning_rate_fn, num_labels=max_seq_length, training_args=training_args)
+    # endregion
+
+    # region Define train step functions
+    def train_step(
+        state: train_state.TrainState, batch: Dict[str, Array], dropout_rng: PRNGKey
+    ) -> Tuple[train_state.TrainState, float]:
+        """Trains model with an optimizer (both in `state`) on `batch`, returning a pair `(new_state, loss)`."""
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+        start_positions = batch.pop("start_positions")
+        end_positions = batch.pop("end_positions")
+        targets = (start_positions, end_positions)
+
+        def loss_fn(params):
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)
+            loss = state.loss_fn(logits, targets)
+            return loss
+
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+        metrics = jax.lax.pmean({"loss": loss, "learning_rate": learning_rate_fn(state.step)}, axis_name="batch")
+        return new_state, metrics, new_dropout_rng
+
+    p_train_step = jax.pmap(train_step, axis_name="batch", donate_argnums=(0,))
+    # endregion
+
+    # region Define eval step functions
+    def eval_step(state, batch):
+        logits = state.apply_fn(**batch, params=state.params, train=False)
+        return state.logits_fn(logits)
+
+    p_eval_step = jax.pmap(eval_step, axis_name="batch")
+    # endregion
+
+    # region Define train and eval loop
+    logger.info(f"===== Starting training ({num_epochs} epochs) =====")
+    train_time = 0
+
+    # make sure weights are replicated on each device
+    state = replicate(state)
+
+    train_time = 0
+    step_per_epoch = len(train_dataset) // train_batch_size
+    total_steps = step_per_epoch * num_epochs
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+
+        train_start = time.time()
+        train_metrics = []
+
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+
+        # train
+        for step, batch in enumerate(
+            tqdm(
+                train_data_collator(input_rng, train_dataset, train_batch_size),
+                total=step_per_epoch,
+                desc="Training...",
+                position=1,
+            ),
+            1,
+        ):
+            state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs)
+            train_metrics.append(train_metric)
+
+            cur_step = epoch * step_per_epoch + step
+
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+                epochs.write(
+                    f"Step... ({cur_step}/{total_steps} | Training Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )
+
+                train_metrics = []
+
+            if (
+                training_args.do_eval
+                and (cur_step % training_args.eval_steps == 0 or cur_step % step_per_epoch == 0)
+                and cur_step > 0
+            ):
+
+                eval_metrics = {}
+                all_start_logits = []
+                all_end_logits = []
+                # evaluate
+                for batch in tqdm(
+                    eval_data_collator(eval_dataset, eval_batch_size),
+                    total=len(eval_dataset) // eval_batch_size,
+                    desc="Evaluating ...",
+                    position=2,
+                ):
+                    _ = batch.pop("example_id")
+                    _ = batch.pop("offset_mapping")
+                    predictions = p_eval_step(state, batch)
+                    start_logits = np.array([pred for pred in chain(*predictions[0])])
+                    end_logits = np.array([pred for pred in chain(*predictions[1])])
+                    all_start_logits.append(start_logits)
+                    all_end_logits.append(end_logits)
+
+                # evaluate also on leftover examples (not divisible by batch_size)
+                num_leftover_samples = len(eval_dataset) % eval_batch_size
+
+                # make sure leftover batch is evaluated on one device
+                if num_leftover_samples > 0 and jax.process_index() == 0:
+                    # take leftover samples
+                    batch = eval_dataset[-num_leftover_samples:]
+                    batch = {k: np.array(v) for k, v in batch.items()}
+                    _ = batch.pop("example_id")
+                    _ = batch.pop("offset_mapping")
+
+                    predictions = eval_step(unreplicate(state), batch)
+                    start_logits = np.array([pred for pred in predictions[0]])
+                    end_logits = np.array([pred for pred in predictions[1]])
+                    all_start_logits.append(start_logits)
+                    all_end_logits.append(end_logits)
+
+                max_len = max([x.shape[1] for x in all_start_logits])  # Get the max_length of the tensor
+
+                # concatenate the numpy array
+                start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
+                end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)
+
+                # delete the list of numpy arrays
+                del all_start_logits
+                del all_end_logits
+                outputs_numpy = (start_logits_concat, end_logits_concat)
+                prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy)
+                eval_metrics = compute_metrics(prediction)
+
+                logger.info(f"Step... ({cur_step}/{total_steps} | Evaluation metrics: {eval_metrics})")
+
+                if jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if (cur_step % training_args.save_steps == 0 and cur_step > 0) or (cur_step == total_steps):
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(unreplicate(state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
+        epochs.desc = f"Epoch ... {epoch + 1}/{num_epochs}"
+    # endregion
+
+
+if __name__ == "__main__":
+    main()
--- a/Show More
+++ b/Show More